diff --git a/qcd/part_cpu/LICENCE b/qcd/part_cpu/LICENCE new file mode 100644 index 0000000000000000000000000000000000000000..3eff5c81f79472018c6e4406ee1510fbada7c20e --- /dev/null +++ b/qcd/part_cpu/LICENCE @@ -0,0 +1,42 @@ +Copyright and Disclaimer +Copyright (C) 2008, Forschungszentrum Juelich GmbH, Federal Republic of Germany. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Any publications that result from the use of this software shall + reasonably refer to the Research Centre's development. + + * All advertising materials mentioning features or use of this + software must display the following acknowledgement: + + This product includes software developed by Forschungszentrum + Juelich GmbH, Federal Republic of Germany. + + * Forschungszentrum Juelich GmbH is not obligated to provide the + user with any support, consulting, training or assistance of any + kind with regard to the use, operation and performance of this + software or to provide the user with any updates, revisions or + new versions. + +THIS SOFTWARE IS PROVIDED BY FORSCHUNGSZENTRUM JUELICH GMBH "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL FORSCHUNGSZENTRUM JUELICH +GMBH BE LIABLE FOR ANY SPECIAL, DIRECT OR CONSEQUENTIAL DAMAGES OR ANY +DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE ACCESS, USE OR PERFORMANCE OF +THIS SOFTWARE. diff --git a/qcd/part_cpu/applications/QCD/QUICK_GUIDE_UEABS_QCD_BENCHMARKSUITE b/qcd/part_cpu/applications/QCD/QUICK_GUIDE_UEABS_QCD_BENCHMARKSUITE new file mode 100644 index 0000000000000000000000000000000000000000..138b332d8f10d1b0b554247e1f155edc79a8f609 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/QUICK_GUIDE_UEABS_QCD_BENCHMARKSUITE @@ -0,0 +1,108 @@ +################# +################# UEABS - QCD - BENCHMARKSUITE -- QUICK-USERGUIDE +################# + +This is a very short summary of the general step, which has +to be performed, to run the UEABS QCD Benchmarksuite on a new +machine. More information can be found in the documentation of +the UEABS-QCD BENCHMARKSUITE which is located in in the folder +./PABS/doc/* +or under the web-link + +http://www.prace-ri.eu/UEABS/QCD/QCD_Build_README.txt +http://www.prace-ri.eu/UEABS/QCD/QCD_Run_README.txt + +The suite works with Jube, which will handle the compilation, +the submission and the analysis of the Benchmarksuite. On a new +machine several xml-files has to be added or created. +This guide will give a short and very quick overview about +the different steps. + +The FIRST STEP on a new machine is to add information about the +system to the platform-folder located in: +./PABS/platform +Here, the new platform has to be added to the xml-file "platform.xml" +similar to the already xml-templates: + +.. + + + + +The SECOND STEP is to provide a dummy-submit script which has to +added to a new subdirectory given by: + +./PABS/platform/"NEW-PLATFORM" + +In the THIRD STEP: Go to the home-directory of the UEABS-QCD-Benchmarksuite +located in: +./PABS/applications/QCD/ +Note that the source-files of the kernels are located in "./PABS/applications/QCD/src". +Here, similar to STEP ONE the xml-files: + +compile.xml, execute.xml and analyse.xml + +has to be edit, i.e. new xml-templates with the new platform-information +has to be added. + +In the FOURTH STEP the runs will be setup by creating runs-scripts similar to +"prace-functional-NEW-PLATORM.xml" for a functional test +and +"prace-scaling-NEW-PLATORM.xml" for a scaling run. +Here, several limits of the different codes has to be taken into account, see for +this the section "Limitation" at the end of this quick-userguide. + +In the FIFTH STEP the benchmark can be compiled and ran by using the command: + +perl ../../bench/jube prace-functional-"NEW-PLATFORM".xml + +in the directory: +"./PABS/applications/QCD/". +This will generate a folder "tmp" with subfolder in "./PABS/applications/QCD/" +where the source-file will be compiled and executed. If the compilation or the submission +fails, more information can be found in the subdirectories of "tmp". In any cases +after the generation of the folder "tmp", compilation and submition can be done, +in principle, without Jube. + +In the LAST STEP, the scaling results can be analyzed, by using +perl ../../bench/jube analyse.xml + +LIMITATION: + +The different kernels consists of lattice QCD production codes and have several limitations +in parallelization and lattice volume. Kernel A,C,D and E using a four dimensional +lattice while in case of kernel B a three dimensional lattice is used. All kernels +can be parallelized in all direction. The different lattice sizes and parallelization +has to be declared in the scripts: 'prace-functional-"NEW-PLATFORM".xml' or +'prace-scaling-NEW-PLATORM.xml'. The limitation for the different kernel are given by: + +"pt * px * py * pz = task" + +and additional for the Kernel A, D and E + +" nt / pt modulo 2 = 0 " and " nt => 4 " + +and the same condition for the other pairs +"{nx,px}, {ny,py}, {nz,pz}". Moreover +the lattice extends nt, nx, ny and nx has to be even and larger +than 4. + +####### +####### Please see for further information the Readme-files +####### which are provided under +####### +####### http://www.prace-ri.eu/UEABS/QCD/QCD_Build_README.txt +####### http://www.prace-ri.eu/UEABS/QCD/QCD_Run_README.txt +####### or in +####### ./PABS/doc/* +####### +####### Jacob Finkenrath, 2017 +####### \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/README b/qcd/part_cpu/applications/QCD/README new file mode 100644 index 0000000000000000000000000000000000000000..e5ca95368bcab8545334c73216e2b6619d114301 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/README @@ -0,0 +1,1007 @@ +The lattice quatum chromodynamics (LQCD) benchmark is a compilation +of up to five (three in the moment) LQCD kernels. The kernels are: + +label : kernel_A +short label : KA +kernel origin : Berlin Quantum ChromoDynamics program (BQCD), + DEISA benchmark suite +kernel contact person : Hinnerk Stueben +kernel code status : 2008/08/25 +problem size parameter : KA_N{X,Y,Z,T}, 4D lattice +problem run time parameter : KA_MAXITER, iteration steps +other needed parameter : KA_P{X,Y,Z,T}, distribution of processes in 4D + KA_LIBCOMM, see readme section + KA_LIBCLOVER, see readme section + KA_LIBD, see readme section +notes : + + +label : kernel_B +short label : KB +kernel origin : University of Oulu, Finland + DEISA benchmark suite +kernel contact person : Kari Rummukainen +kernel code status : 2008/08/22 +problem size parameter : KB_NX, x component of the 3D grid + KB_NY, y component of the 3D grid + KB_NZ, z component of the 3D grid +problem run time parameter : KB_MAXITER, iteration steps +other needed parameter : +notes : number of processes needs to be a power of 2 + + +label : kernel_C +short label : KC +kernel origin : privat communication + +kernel contact person : Jacob Finkenrath +kernel code status : 2016/08/24 +problem size parameter : KC_L{X,Y,Z,T}, local size of the 4D grid in {x,y,z,t}-direction +problem run time parameter : +other needed parameter : KC_P{X,Y,Z,T}, number of processes in {x,y,z,t}-direction +notes : + + +label : kernel_D +short label : KD +kernel origin : privat communication + +kernel contact person : Jacob Finkenrath +kernel code status : 2016/08/24 +problem size parameter : K_L{X,Y,Z,T}, size of the 4D grid in {x,y,z,t}-direction +problem run time parameter : +other needed parameter : K_N{X,Y,Z,T}, number of processes in {x,y,z,t}-direction +notes : + + +label : kernel_E +short label : KE +kernel origin : privat communication + +kernel contact person : Stefan Krieg +kernel code status : 2008/11/10 +problem size parameter : K_L{X,Y,Z,T}, size of the 4D grid in {x,y,z,t}-direction +problem run time parameter : +other needed parameter : K_N{X,Y,Z,T}, number of processes in {x,y,z,t}-direction +notes : + + +###################################################################### +kernel_A README + +----------- +BQCD readme +----------- + + + +Note: all base information taken from the +BQCD document; updated with JuBE and new ported platforms + +Subdirectories in src: +~~~~~~~~~~~~~~~~~~~~ + +clover routines for the clover improvement + +comm communication routines + +d multiplication of a vector with "D-slash" + +modules (some) Fortran90 modules + +platform Makefiles and service routines for various platforms + + +General remarks +~~~~~~~~~~~~~~ + +BQCD has been ported to various platforms (see platform/Makefile-*.var): + +# Makefile-altix.var - settings on SGI-Altix 3700 and SGI-Altix 4700 +# Makefile-bgl.var - settings on BlueGene/L +# Makefile-cray.var - settings on Cray T3E and Cray XT4 +# Makefile-hitachi-omp.var - settings on Hitachi SR8000 +# Makefile-hitachi.var - settings on Hitachi SR8000 (pure MPI version) +# Makefile-hp.var - settings for HP-UX Fortran Compiler +# Makefile-ibm.var - settings on IBM +# Makefile-intel.var - settings for Intel Fortran Compiler +# Makefile-nec.var - settings on NEC SX-8 +# Makefile-sun.var - settings on Sun + +The corresponding files + + platform/service-*.F90 + +contain interfaces to service routines / system calls. + +Not all of these files have been used recently. There are kept as a +starting point. + +A "Makefile.var" and a "service.F90" have to be provided in the source +directory that work correctly with your system. +The contents of these files is explained in: + + platform/Makefile-EXPLAINED.var + platform/service-EXPLAINED.var + +"gmake prep-" will create symbolic links accordingly: + +berni1> gmake prep-ibm +gmake prep PLATFORM=ibm +rm -f Makefile.var service.F90 +ln -s platform/Makefile-ibm.var Makefile.var +ln -s platform/service-ibm.F90 service.F90 + + + +Resource requirements +~~~~~~~~~~~~~~~~~~~~ + +The resource requirements are approximately: + +benchmark lattice total memory size of output execution time +--------------------------------------------------------------------------- +MPP 48*48*48*96 497 GByte 4 GByte 268.2 s at 758.52 GFlop/s +SMP 24*24*24*48 37 GByte 0.25 GByte 44.4 s at 608.96 GFlop/s + + +Standart porting +~~~~~~~~~~~~~~~ + +*** make + +The Makefiles use the makro $(MAKE) and the "include" statement. Some +of Makefile-*.var are quite standard, some require GNU-make. + +"make fast" can be used for a parallel "make". + +"make fast" builds the binary "bqcd." + +Without "make fast" one has to enter: + + make Modules + make libs + make bqcd + +JuBE porting +~~~~~~~~~~~ +For Altix: + Change the following lines in the execution file bensh: + the first line: #!/usr/local/bin/perl -w + to + #!/usr/bin/perl -w + the line 1235: $cmd="cp -rp $srcdir/$file $dir/src/"; +to + $cmd="cp -rp $srcdir/* $dir/src/"; + + + + + + +*** ANSI C preprocessor + +The C preprocessor is needed for building the source. The C +preprocessor must be able to handle the string concatenation macro "##". + +Recent versions of the GNU C Proprocesse do not work because they +refuse to process the Fortran90 separator "%". + + +*** Service routines and "stderr" + +Service routines are needed for aborting, measuring CPU-time, to get +arguments from the command line, etc. The corresponding routines have +to be inserted in the file service.F90. + +It is assumed that Fortran unit 0 is pre-connected to stderr. If this +is not the case on your machine you should re-#define STDERR in "defs.h". + +For the time measurements it is important to use a time function with +high resolution in the function "sekunden". + + + +*** Message passing / Communication library + +Originally the communication was programmed with the shmem library on +a Cray T3E. + +Now MPI is mainly used. There is also a single CPU version (that +needs no communication library) and a combination of shmem for the +most time consuming part and MPI. + +See $(LIBCOMM) in platform/Makefile-EXPLAINED.var and "Hints for +optimisation" below. + + +*** OpenMP + +In addition to setting your compiler's OpenMP option you have to add +"-D_OPENMP" in "Makefile.var": + + MYFLAGS = ... -D_OPENMP + + + +Verification +~~~~~~~~~~~ + +*** Random numbers + +Correctness of random numbers can be checked by: + + make the_ranf_test + +The test is done by comparison with reference output. On most +platforms there is no difference. However, on Intel "diff" +usually reports differences in the last digit of the floating point +representation of the random numbers; the integer representations +match exactly, eg: + +< 1 4711 0.5499394951912783 +--- +> 1 4711 0.5499394951912784 + + +*** Argments from the command line + +Try option -V: + +berni1> ./bqcd -V + This is bqcd benchmark2 + input format: 4 + conf info format: 3 + MAX_TEMPER: 50 + real kind: 8 + version of D: 2 + D3: buffer vol: 0 + communication: single_pe + OpenMP + + + +*** BQCD + +To check that the BQCD works correctly execute the following sequence +of commands: + +berni1> cd work +berni1> ../bqcd input.TEST > out.TEST +berni1> grep ' %[fim][atc]' out.TEST > out.tmp +berni1> grep ' %[fim][atc]' out.TEST.reference | diff - out.tmp +18c18 +< %fa -1 1 0.4319366404 1.0173348431 43 407 38 +--- +> %fa -1 1 0.4319366404 1.0173348433 43 407 38 + +The test can be run for any domain decomposition and any number of +threads. In any case result should agree. Floating point numbers +might differ in the last digit as shown above. +(In total 20 lines containing floating point numbers are compared.) + + +*** Check sums + +BQCD writes restart files in the working directory. The extension of +the file containing information on the run is ".info". It contains +check sums of the binary data files (the example was run after the +test run): + +berni1> tail -6 bqcd.000.1.info + >BeginCheckSum + bqcd.000.1.00.u 286125633 24576 + bqcd.000.1.01.u 804770858 24576 + bqcd.000.1.02.u 657813015 24576 + bqcd.000.1.03.u 3802083338 24576 + >EndCheckSum + +These check sums should be identical to check sums calculated by the +"cksum" command: + +berni1> cksum bqcd.000.1.*.u | awk '{print $3, $1, $2}' +bqcd.000.1.00.u 286125633 24576 +bqcd.000.1.01.u 804770858 24576 +bqcd.000.1.02.u 657813015 24576 +bqcd.000.1.03.u 3802083338 24576 + + + +Structure of the input +~~~~~~~~~~~~~~~~~~~~~ + +run 204 names of restart files will contain "run" + can be set to 0 + +lattice 24 24 24 48 lattice size, can e.g. be modified for + weak scaling analysis + +processes 1 2 2 4 number of MPI-proceses per direction + (1 1 1 1 in the pure OpenMP case) + +boundary_conditions_fermions 1 1 1 -1 do not change + +beta 5 do not change +kappa 0.13 do not change +csw 2.3327 do not change + +hmc_test 0 do not change +hmc_model C do not change +hmc_rho 0.1 do not change +hmc_trajectory_length 0.2 do not change +hmc_steps 10 can be lowered -> shorter execution time +hmc_accept_first 1 do not change +hmc_m_scale 3 do not change + +start_configuration cold do not change +start_random default do not change + +mc_steps 1 do not change +mc_total_steps 100 do not change + +solver_rest 1e-99 do not change +solver_maxiter 100 can be lowered -> shorter execution time +solver_ignore_no_convergence 2 do not change (CG will not converge, + the numbers of iterations per call + will be exactly "solver_maxiter") +solver_mre_vectors 7 do not change + + + + + +Hints on optimisation +~~~~~~~~~~~~~~~~~~~~ + +Before starting any optimisation one should find the fastest variant +in the existing code. There are two libraries to look at: $(LIBD) and +$(LIBCOMM). + + + +*** LIBCOMM ("communication", directory: comm) + +There are the following variants: + +lib_single_pe.a: Single CPU version (PE: "processing element"). + +lib_mpi.a: MPI version. + +lib_shmempi.a: shmem for nearest neighbour communication, MPI for the rest. + + +*** Caveat + +Not all combinations of LIBD and LIBCOMM have been implemented. + +The following combinations should work (lib_mpi.a always works): + +LIBD LIBCOMM +-------------------------------------------------- +libd.a lib_single_pe.a lib_mpi.a +libd2.a lib_single_pe.a lib_mpi.a lib_schmempi.a +libd3.a lib_mpi.a +libd21.a lib_single_pe.a lib_mpi.a lib_schmempi.a + + + +Rules for time measurements +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In "Makefile.var" "-DTIMING" must always be set: + + MYFLAGS = -DTIMING ... + +All time measurements (TIMING_START() ... TIMING_STOP()) must be kept. +There is one exception: If you restructure routines d() and d_dag() +it might occur that the current regions of time measurements (which +are per direction) do not make sense. (For example, this would occur +when combining loops from more than one direction.) + +In that case, please report in addition the best measurement obtained +with the existing code. + + + +###################################################################### +kernel_B README + +This is the README file for the SU3_AHiggs application benchmark, +distributed with the DEISA Benchmark Suite: +http://www.deisa.eu/science/benchmarking/ + +Last modified by the DEISA Benchmark Team on 2008-08-22. + + + +----------------- +SU3_AHiggs readme +----------------- + + + +Contents +-------- + +1 General description +2 Code structure +3 Parallelisation +4 Building +5 Execution +6 Verification +7 Input data +8 Output data + + +1 General description +===================== + +SU3_AHiggs is a lattice quantum chromodynamics (QCD) code intended for +computing the conditions of the Early Universe. Instead of the "full QCD", the +code applies an effective field theory, which is valid at high +temperatures. In the effective theory, the lattice is 3D. For this reason, +SU3_AHiggs stresses different parts of the architecture than the conventional +QCD applications using 4D lattices. + +SU3_AHiggs has roots in the MILC code, but it is heavily rewritten by +Prof. Kari Rummukainen (University of Oulu, Finland). The code is written +solely in C and it uses MPI communications. No external libraries are needed +to run the program. + +The directory SU3/src contains several closely related QCD programs: + + * SU3_4D + * SU3_AHiggs + * SU3_Gauge + +In the DEISA benchmarks, only the code SU3_AHiggs is used. If you find errors +in any of the files in the SU3 package, please contact benchmarking@deisa.eu. + + +2 Code structure +================ + +In SU3_AHiggs, the spacetime is discretised and replaced with a 3D cubic +lattice. Every lattice vertex contains a 3 x 3 traceless Hermitian +matrix. From each vertex, in turn, there are six edges to nearest-neighbour +vertices. Edges are 3 x 3 unitary matrices. + +The aim of the SU3_AHiggs computation is to generate lattice configurations +from the microcanonical distribution, which is the statistical equilibrium +state of the system. The program uses heat-bath and over-relaxation algorithms +to update lattice vertices and links. The computation starts from a random +initial configuration. + +The main function of SU3_AHiggs is in the file su3h_n/control.c. After the +initial setup, main calls the function runthis, which in turn calls other +functions in the SU3 package. If the dataset is sufficiently large, most of +the computing time is spent on lattice updates (functions updategauge and +updatehiggs in files su3h_n/updategauge.c and su3h_n/updatehiggs.c). If the +dataset is too small, in turn, the computation becomes communication +bound. MPI routines are not called directly but with customised communication +functions defined in generic/comdefs.h and generic/com_mpi.c. + + +3 Parallelisation +================= + +SU3_AHiggs uses a 3D domain decomposition method for parallelisation. Each MPI +task communicates with six neighbouring tasks only. The communication routines +are defined in the files generic/comdefs.h and generic/com_mpi.c. The most +important routines are: + + * start_get() + + This function starts asynchronous sends and receives required to gather + neighbouring lattice vertices and links. The call graph looks like this: + + start_get() --> start_gather() --> MPI_Irecv(), MPI_Isend() + + * wait_get() + + This function waits for receives to finish, ensuring that the data has + actually arrived. The call graph looks like this: + + wait_get() --> wait_gather() --> MPI_Wait() + +With a 32^3 lattice, the program performs well up to 256 processes. With a +256^3 lattice, the speedup is almost linear with the number of processes. The +highest processor number tested so far is 2048. The lattice size and the +number of iterations are controlled by four user-adjustable parameters. + + +4 Building +========== + +To build SU3_AHiggs with the JUBE tool on a new architecture (NEWARCH), do the +following steps: + + 1) Create a new top-level XML file for the new architecture + (bench-NEWARCH.xml). In this task, you can use the already existing + files as a starting point: bench-Cray-XT4-Louhi.xml, + bench-IBM-SP4-Jump.xml, and bench-SGI-Altix-HLRB2.xml. Normally you have + to change the values of $nodes and $taskspernode only. + + 2) Edit compile.xml: Create a new section , where + NEWARCH is the same as in the file + DEISA_BENCH/platform/platform.xml. Substitute values in the new compile + section with those proper for the new architecture. Normally you need to + change #CFLAGS# and #LDFLAGS#. Possibly you want to change #CC# and + #MPI_CC# also. + + 3) Run the compile step within the benchmark "test": Edit bench-NEWARCH.xml + and make sure that you have: + + + + ... + + + Then run: perl ../../bench/jube -debug bench-NEWARCH.xml + +If the compile step fails, go to the directory where JUBE has run the compile +step: + + tmp/SU3_NEWARCH_test_i000001/.../src + +Then try to run the command make manually. Analyze the error and try to fix it +modifying the file Makefile.defs. After the problem is solved, edit the file +compile.xml accordingly. If you cannot solve the problem just by editing +compile.xml, please contact benchmarking@deisa.eu. + + +5 Execution +=========== + +To run SU3_AHiggs with the JUBE tool, do the following steps: + + 1) Before running the benchmarks you need an execute script template, such + as: + + DEISA_BENCH/platform/Cray-XT4-Louhi/cray_qsub.job.in + + 2) Edit execute.xml: Create a new section , and + match the values in the new section with the execute script template. + + 3) Run a benchmark: Select a benchmark by setting active="1" in the file + bench-NEWARCH.xml. Then run: perl ../../bench/jube -submit + bench-NEWARCH.xml + +To run SU3_AHiggs manually (without JUBE), do the following steps: + + 1) Copy the SU3_AHiggs executable to a directory that is accessible from + compute nodes. The name of the SU3_AHiggs executable is: + + src/su3h_n/su3_ahiggs + + 2) Copy the input files beta, parameter, and status to the same + directory. In the directory input, there are several sets of input files + available: + + input/lat_256/* (256^3 lattice, 100 iterations) + input/lat_32/* (32^3 lattice, 10000 iterations) + input/test/* (32^3 lattice, 100 iterations) + + 3) Start the program with a MPI launcher available in your system, for + example: + + aprun -n 8 ./su3_ahiggs + + The test benchmark takes approximately 10 seconds with 8 processor + cores. Other benchmarks run longer: approximately 1 minute with 1024 + cores. + +Important: The number of tasks in su3_ahiggs must be a power of 2. Otherwise +the program cannot layout the lattice, and the execution stops. + + +6 Verification +============== + +JUBE verifies benchmark results automatically as part of the result analysis +step. In SU3_AHiggs, the verification cannot be done directly by comparing +benchmark results with some reference results. The reason to this is that the +results are very sensitive to compiler optimizations and the number of MPI +tasks as well. This can make results to appear very different if compared with +the reference results. Everything can still be all right, as long as the +results are statistically the same. + +Therefore SU3_AHiggs uses a statistical comparison test to verify benchmark +results (Student's t-test). Significance level is chosen to be 1e-4 (correct +results are rejected once every 10000 times). First 50 iterations are not +included in the comparison. The reference results are found at: + + reference/lat_256/higgs.out (256^3 lattice, 100 iterations) + reference/lat_32/higgs.out (32^3 lattice, 10000 iterations) + reference/test/higgs.out (32^3 lattice, 100 iterations) + +These files contain the Higgs field at each iteration for a given lattice +size. + +To verify benchmark results manually (without JUBE), do the following steps: + + 1) Copy the executable src/aa/aa to the directory SU3/run. + + 2) Run the following command in the directory SU3: + + perl run/check_results_su3.pl output.xml stdout.log stderr.log \ + $RUNDIR reference/lat_256 + + The environment variable $RUNDIR should point to the directory where + SU3_AHiggs has been executed. + + 3) If the benchmark results are correct, the file output.xml includes the + following lines: + + + + + If not, the same lines look like this: + + + + + +7 Input data +============ + +Input data for SU3_AHiggs consist of three short ASCII files containing +simulation parameters related to temperature, lattice size, iterations, etc. + +For example, the files related to the test benchmark look like this: + +input/test/beta: + + betag 12 + x 0.06 + y 0.69025056 + +input/test/parameters: + + nx 32 + ny 32 + nz 32 + micro steps 4 + n_measurement 1 + n_correlation 10000 + w_correlation 100000 + n_save -1000 + blocking levels 1 + level 0 1 + level 1 1 + +input/test/status: + + restart 0 + n_iteration 100 + n_thermal 0 + seed 479817384 + run status + iteration + time: gauge + time: higgs + time: rest + +It is easy to create new datasets by changing the lattice size (variables nx, +ny, and nz), number of iterations (n_iteration), and seed number for the +random number generator (seed). The duration of a simulation is roughly +proportional to: + + nx * ny * nz * n_iteration + +SU3_AHiggs has currently three datasets: + + test 32^3 lattice, 100 iterations + small 32^3 lattice, 10000 iterations (artificial dataset) + large 256^3 lattice, 100 iterations (real research dataset) + +The test dataset is designed to help porting to new architectures. The small +dataset, in turn, is designed for benchmarking purposes. With it, benchmark +timings depend strongly on the interconnect speed. + + +8 Output data +============= + +During the benchmarks, SU3_AHiggs writes out its result to the following files: + + correl + measure + status + +Note that the file named status is both input and output file; SU3_AHiggs +modifies it during the computation. The file measure is a binary file that +contains simulation results at each iteration. Its contents can be read with +the tool named aa available in the directory src/aa. + +The benchmark timings are written to the standard output. JUBE reads them +automatically as part of the analysis step. To get benchmark timings manually, +grep for "total time in seconds" in the standard output. + + + +###################################################################### +kernel_C README + + +This document is short guide to get started and run the speed tests. For +more detailed information see the README.extended. + + +PROGRAMS + +The benchmark programs are provided in source form and must be +compiled by the user on the machine that is to be tested. + +In addition the openQCD-1.4 package is needed. A tar-file of the +source code can be obtained from + +http://luscher.web.cern.ch/luscher/openQCD/ + +and should be extracted in the same directory level as this package. + +PROGRAM FEATURES + +All programs parallelize in 0,1,2,3 or 4 dimensions, depending on what is +specified at compilation time. They are highly optimized for machines with +current Intel or AMD processors, but will run correctly on any system that +complies with the ISO C89 (formerly ANSI C) and the MPI 1.2 standards. + +For the purpose of testing and code development, the programs can also +be run on a desktop or laptop computer. All what is needed for this is +a compliant C compiler and a local MPI installation such as Open MPI. + + +DOCUMENTATION + +The simulation program has a modular form, with strict prototyping and a +minimal use of external variables. Each program file contains a small number +of externally accessible functions whose functionality is described at the top +of the file. + +The data layout is explained in various README files and detailed instructions +are given on how to run the main programs. A set of further documentation +files are included in the doc directory, where the normalization conventions, +the chosen algorithms and other important program elements are described. + + +COMPILATION + +The compilation of the programs requires an ISO C89 compliant compiler and a +compatible MPI installation that complies with the MPI standard 1.2 (or later). + +In the main and devel directories, a GNU-style Makefile is included which +compiles and links the programs (just type "make" to compile everything; "make +clean" removes the files generated by "make"). The compiler options can be set +by editing the CFLAGS line in the Makefiles. + +The Makefiles assume that the following environment variables are set: + + GCC GNU C compiler command [Example: /usr/bin/gcc]. + + MPI_HOME MPI home directory [Example: /usr/lib64/mpi/gcc/openmpi]. + The mpicc command used is the one in $MPI_HOME/mpicc and + the MPI libraries are expected in $MPI_HOME/lib. + + MPI_INCLUDE Directory where the mpi.h file is to be found. + +All programs are then compiled using the $MPI_HOME/bin/mpicc command. The +compiler options that can be set in the CFLAGS line depend on which C compiler +the mpicc command invokes (the GCC compiler command is only used to resolve +the dependencies on the include files). + + +SSE/AVX ACCELERATION + +Current Intel and AMD processors are able to perform arithmetic operations on +short vectors of floating-point numbers in just one or two machine cycles, +using SSE and/or AVX instructions. The arithmetic performed by these +instructions fully complies with the IEEE 754 standard. + +Many programs in the module directories include SSE and AVX inline-assembly +code. On 64bit systems, and if the GNU or Intel C compiler is used, the code +can be activated by setting the compiler flags -Dx64 and -DAVX, respectively. +In addition, SSE prefetch instructions will be used if one of the following +options is specified: + + -DP4 Assume that prefetch instructions fetch 128 bytes at a time + (Pentium 4 and related Xeons). + + -DPM Assume that prefetch instructions fetch 64 bytes at a time + (Athlon, Opteron, Pentium M, Core, Core 2 and related Xeons). + + -DP3 Assume that prefetch instructions fetch 32 bytes at a time + (Pentium III). + +These options have an effect only if -Dx64 or -DAVX is set. The option +-DAVX implies -Dx64. + +On recent x86-64 machines with AMD Opteron or Intel Xeon processors, for +example, the recommended compiler flags are + + -std=c89 -O -mno-avx -DAVX -DPM + +For older machines that do not support the AVX instruction set, the +recommended flags are + + -std=c89 -O -mno-avx -Dx64 -DPM + +More aggressive optimization levels such as -O2 and -O3 tend to have little +effect on the execution speed of the programs, but the risk of generating +wrong code is higher. + +AVX instructions and the option -mno-avx may not be known to old versions +of the compilers, in which case one is limited to SSE accelerations with +option string -std=c89 -O -Dx64 -DPM. + + +DEBUGGING FLAGS + +For troubleshooting and parameter tuning, it may helpful to switch on some +debugging flags at compilation time. The simulation program then prints a +detailed report to the log file on the progress made in specified subprogram. + +The available flags are: + +-DCGNE_DBG CGNE solver. + +-DFGCR_DBG GCR solver. + +-FGCR4VD_DBG GCR solver for the little Dirac equation. + +-DMSCG_DBG MSCG solver. + +-DDFL_MODES_DBG Deflation subspace generation. + +-DMDINT_DBG Integration of the molecular-dynamics equations. + +-DRWRAT_DBG Computation of the rational function reweighting + factor. + + +RUNNING A SIMULATION + +The simulation programs reside in the directory "main". For each program, +there is a README file in this directory which describes the program +functionality and its parameters. + +Running a simulation for the first time requires its parameters to be chosen, +which tends to be a non-trivial task. The syntax of the input parameter files +and the meaning of the various parameters is described in some detail in +main/README.infiles and doc/parms.pdf. Examples of valid parameter files are +contained in the directory main/examples. + + +EXPORTED FIELD FORMAT + +The field configurations generated in the course of a simulation are written +to disk in a machine-independent format (see modules/misc/archive.c). +Independently of the machine endianness, the fields are written in little +endian format. A byte-reordering is therefore not required when machines with +different endianness are used for the simulation and the physics analysis. + + +AUTHORS + +The initial release of the openQCD package was written by Martin Lüscher and +Stefan Schaefer. Support for Schrödinger functional boundary conditions was +added by John Bulava. Several modules were taken over from the DD-HMC program +tree, which includes contributions from Luigi Del Debbio, Leonardo Giusti, +Björn Leder and Filippo Palombi. + + +ACKNOWLEDGEMENTS + +In the course of the development of the openQCD code, many people suggested +corrections and improvements or tested preliminary versions of the programs. +The authors are particularly grateful to Isabel Campos, Dalibor Djukanovic, +Georg Engel, Leonardo Giusti, Björn Leder, Carlos Pena and Hubert Simma for +their communications and help. + + +LICENSE + +The software may be used under the terms of the GNU General Public Licence +(GPL). + + +BUG REPORTS + +If a bug is discovered, please send a report to . + + +ALTERNATIVE PACKAGES AND COMPLEMENTARY PROGRAMS + +There is a publicly available BG/Q version of openQCD that takes advantage of +the machine-specific features of IBM BlueGene/Q computers. The version is +available at . + +The openQCD programs currently do not support reweighting in the quark +masses, but a module providing this functionality can be downloaded from +. + +Previously generated gauge-field configurations are often used as initial +configuration for a new run. If the old and new lattices or boundary +conditions are not the same, the old configuration may however need to be +adapted, using a field conversion tool such as the one available at +, before the new run is started. + +###################################################################### +kernel_D README + +Important compiler defines XXX are (-DXXX) +MPI -> switch on parallelisation +PARALLELXYZT -> 4-dimensional parallelisation +PARALLELXYT -> 3-dim +PARALLELXT -> 2-dim +PARALLELT -> 1-dim +SSE2 -> SSE2 inline assembly (to be used with one of the two following) +P4 -> pentium 4 +OPTERON -> opteron +_GAUGE_COPY -> non-strided memory access for gauge fields, but more memory required +BGL -> Blue Gene /L +BGP -> Blue Gene /P, to be used on top of BGL + +If none of them are used, you will get a serial version of the program. + +The local lattice size in the case of the one dimensional +prallelisation is controlled by the parameters in the file +benchmark.input: + +T = 32 +L = 16 + +which will give a 32 x 16^3 global lattice. + +NrXProcs = 2 + +needs only to be set in case of a parallel compilation and sets +the number of processes in x-direction. The same holds for NrYProcs and NrZProcs. +The number of processes in +t-direction is computed from NrX|Y|ZProcs and the total number of processes. +You should only take care that all this fits with the lattice size. + +the package size of the data that are send and recieved is +192 * (1/2) * L^3 Byte in case of the one dimensional parallelisation. +In case of the two dimensional parallelisation it is +192 * (1/2) ((L*L*L/N_PROC_X)+(T*L*L)) Byte. + +A run of the benchmark takes about one minute. + +The out-put of the program is something like this: (T=2,L=16) + +The number of processes is 12 +The local lattice size is 2 x 16 ^3 +total time 4.681349e+00 sec, Variance of the time 6.314982e-03 sec + + (297 Mflops [64 bit arithmetic]) + +communication switched off + (577 Mflops [64 bit arithmetic]) + +The size of the package is 393216 Byte +The bandwidth is 84.49 + 84.49 MB/sec + + +If you use the serial version of course the part depending on the +parallel setup will be missing. + + +Compilation examples (you need a c-compiler with c99 standard, otherwise you may need to define inline, restrict etc. to nothing): + +in general (gcc) +gcc -std=c99 -I. -I./ -I.. -o benchmark -D_GAUGE_COPY -O Hopping_Matrix.c mpi_init.c geometry_eo.c test/check_xchange.c test/check_geometry.c boundary.c start.c ranlxd.c init_gauge_field.c init_geometry_indices.c init_moment_field.c init_spinor_field.c read_input.c benchmark.c update_backward_gauge.c D_psi.c ranlxs.c -lm + +gcc and OPTERON (64 Bit architecture): +gcc -std=c99 -I. -I./ -I.. -o benchmark -DOPTERON -DSSE2 -mfpmath=387 -fomit-frame-pointer -ffloat-store -D_GAUGE_COPY -O Hopping_Matrix.c mpi_init.c geometry_eo.c test/check_xchange.c test/check_geometry.c boundary.c start.c ranlxd.c init_gauge_field.c init_geometry_indices.c init_moment_field.c init_spinor_field.c read_input.c benchmark.c update_backward_gauge.c D_psi.c ranlxs.c -lm + +gcc and pentium4: +gcc -std=c99 -I. -I./ -I.. -o benchmark -DSSE2 -DP4 -march=pentium4 -malign-double -fomit-frame-pointer -ffloat-store -D_GAUGE_COPY -O Hopping_Matrix.c mpi_init.c geometry_eo.c test/check_xchange.c test/check_geometry.c boundary.c start.c ranlxd.c init_gauge_field.c init_geometry_indices.c init_moment_field.c init_spinor_field.c read_input.c benchmark.c update_backward_gauge.c D_psi.c ranlxs.c -lm + +mpicc (gcc) general, four dimensional parallelisation: +mpicc -std=c99 -I. -I./ -I.. -o benchmark -O3 -DMPI -DPARALLELXYZT -D_GAUGE_COPY -O Hopping_Matrix.c Hopping_Matrix_nocom.c xchange_deri.c xchange_field.c xchange_gauge.c xchange_halffield.c xchange_lexicfield.c mpi_init.c geometry_eo.c test/check_xchange.c test/check_geometry.c boundary.c start.c ranlxd.c init_gauge_field.c init_geometry_indices.c init_moment_field.c init_spinor_field.c read_input.c benchmark.c update_backward_gauge.c D_psi.c ranlxs.c init_dirac_halfspinor.c -lm + + +###################################################################### +kernel_E README + +none \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/analyse.xml b/qcd/part_cpu/applications/QCD/analyse.xml new file mode 100644 index 0000000000000000000000000000000000000000..e3d204ce271e43f7ca6b105360161edfffcf5c6b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/analyse.xml @@ -0,0 +1,97 @@ + + + + (cd $outdir; bash collectData.sh) + + + + + + + + + + (cd $outdir; bash collectData.sh) + + + + + + + + (cd $outdir; bash collectData.sh) + + + + + + + + (cd $outdir; bash collectData.sh) + + + + + + + + (cd $outdir; bash collectData.sh) + + + + + + + + + + (cd $outdir; bash collectData.sh) + + + + + + + + + + (cd $outdir; bash collectData.sh) + + + + + + + + + + (cd $outdir; bash collectData.sh) + + + + + + + + + + (cd $outdir; bash collectData.sh) + + + + + + + + + + (cd $outdir; bash collectData.sh) + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/compile.xml b/qcd/part_cpu/applications/QCD/compile.xml new file mode 100644 index 0000000000000000000000000000000000000000..045ca1891c60d323ca71994f1e12c4fd02f92506 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/compile.xml @@ -0,0 +1,2071 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ( cd kernel_A; sh fixLouhiPP.sh; cd ..; source /opt/modules/default/init/sh; `index('$CRAYPAT','on')==0 ? 'module load xt-craypat;' : ' '` make `index('$CRAYPAT','on')==0 ? '; pat_build -v -u -g $CRAYPAT_GROUP -o ${execname}+pat $execname; cp ${execname}+pat $execname' : ' '`) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ( cd kernel_A; sh fixLouhiPP.sh; cd ..; source /opt/modules/default/init/sh; `index('$CRAYPAT','on')==0 ? 'module load xt-craypat;' : ' '` make `index('$CRAYPAT','on')==0 ? '; pat_build -v -u -g $CRAYPAT_GROUP -o ${execname}+pat $execname; cp ${execname}+pat $execname' : ' '`) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ( cd kernel_A; sh fixLouhiPP.sh; cd ..; source /opt/modules/default/init/sh; `index('$CRAYPAT','on')==0 ? 'module load xt-craypat;' : ' '` make `index('$CRAYPAT','on')==0 ? '; pat_build -v -u -g $CRAYPAT_GROUP -o ${execname}+pat $execname; cp ${execname}+pat $execname' : ' '`) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (`index('$PAPI','on')==0 ? 'module load papi;' : ' '` `index('$IHPCT_HWC','on')==0 || index('$IHPCT_MPITR','on')==0 ? 'module load hpct;' : ' '` make) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (make) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (make) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (cd kernel_A; sh fixLouhiPP.sh; cd ..; make) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (`index('$IHPCT_HWC','on')==0 || index('$IHPCT_MPITR','on')==0 ? 'module load ihpct;' : ' '` make) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (cd kernel_A; sh fixLouhiPP.sh; cd ..; make) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (cd kernel_A; sh fixLouhiPP.sh; cd ..; make) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (cd kernel_A; sh fixLouhiPP.sh; cd ..; make) + + + + diff --git a/qcd/part_cpu/applications/QCD/execute.xml b/qcd/part_cpu/applications/QCD/execute.xml new file mode 100644 index 0000000000000000000000000000000000000000..deae4afe9d42603614e9d4e928d3fa823ac46d9e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/execute.xml @@ -0,0 +1,450 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + llsubmit ibm_llsubmit.job + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + qsub -q prace cray_qsub.job + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + qsub cray_PBSsubmit.job + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + qsub intel_PBSsubmit.job + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + msub intel_PBSsubmit.job + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + llsubmit ibm_llsubmit.job + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + llsubmit ibm_llsubmit.job + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + llsubmit ibm_llsubmit.job + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + llsubmit ibm_llsubmit.job + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + sbatch intel_SLURMsubmit.job + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + qsub intel_PBSsubmit.job + + + + + + diff --git a/qcd/part_cpu/applications/QCD/input/kernel_A.input b/qcd/part_cpu/applications/QCD/input/kernel_A.input new file mode 100644 index 0000000000000000000000000000000000000000..9a8e24d12eae37c64bc6b35e6e1cb316a54ca851 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/input/kernel_A.input @@ -0,0 +1,29 @@ +run 0 + +lattice 2 2 2 2 +processes 1 1 1 1 +boundary_conditions_fermions 1 1 1 -1 + +beta 5 +kappa 0.13 +csw 2.3327 +h 0 + +hmc_test 0 +hmc_model C +hmc_rho 0.1 +hmc_trajectory_length 0.2 +hmc_steps 10 +hmc_accept_first 1 +hmc_m_scale 3 + +start_configuration cold +start_random default + +mc_steps 1 +mc_total_steps 100 + +solver_rest 1e-99 +solver_maxiter 50 +solver_ignore_no_convergence 2 +solver_mre_vectors 7 diff --git a/qcd/part_cpu/applications/QCD/input/kernel_A.input.in b/qcd/part_cpu/applications/QCD/input/kernel_A.input.in new file mode 100644 index 0000000000000000000000000000000000000000..b5ef4d8645bb9e63ef1cf08e31fe14dab6476069 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/input/kernel_A.input.in @@ -0,0 +1,29 @@ +run 0 + +lattice #KA_LATTICE# +processes #KA_PROCESSES# +boundary_conditions_fermions 1 1 1 -1 + +beta 5 +kappa 0.13 +csw 2.3327 +h 0 + +hmc_test 0 +hmc_model C +hmc_rho 0.1 +hmc_trajectory_length 0.2 +hmc_steps 10 +hmc_accept_first 1 +hmc_m_scale 3 + +start_configuration cold +start_random default + +mc_steps 1 +mc_total_steps 100 + +solver_rest 1e-99 +solver_maxiter #KA_MAXITER# +solver_ignore_no_convergence 2 +solver_mre_vectors 7 diff --git a/qcd/part_cpu/applications/QCD/input/kernel_B.input.beta.in b/qcd/part_cpu/applications/QCD/input/kernel_B.input.beta.in new file mode 100644 index 0000000000000000000000000000000000000000..084fefd6335861e219b2a507ebfb35672efc9fe1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/input/kernel_B.input.beta.in @@ -0,0 +1,3 @@ +betag 12 +x 0.06 +y 0.69025056 diff --git a/qcd/part_cpu/applications/QCD/input/kernel_B.input.parameters.in b/qcd/part_cpu/applications/QCD/input/kernel_B.input.parameters.in new file mode 100644 index 0000000000000000000000000000000000000000..9e892b7d77d7a02a143c855e464b52ace48c2350 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/input/kernel_B.input.parameters.in @@ -0,0 +1,11 @@ +nx #KB_NX# +ny #KB_NY# +nz #KB_NZ# +micro steps 4 +n_measurement 1 +n_correlation 10000 +w_correlation 100000 +n_save -1000 +blocking levels 1 +level 0 1 +level 1 1 diff --git a/qcd/part_cpu/applications/QCD/input/kernel_B.input.status.in b/qcd/part_cpu/applications/QCD/input/kernel_B.input.status.in new file mode 100644 index 0000000000000000000000000000000000000000..9234430ec16df17bf40e6eb2c9646d2075b70c7f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/input/kernel_B.input.status.in @@ -0,0 +1,9 @@ +restart 0 +n_iteration #KB_MAXITER# +n_thermal 0 +seed 989357013 +run status +iteration +time: gauge +time: higgs +time: rest diff --git a/qcd/part_cpu/applications/QCD/input/kernel_D.input.in b/qcd/part_cpu/applications/QCD/input/kernel_D.input.in new file mode 100644 index 0000000000000000000000000000000000000000..585b4875301ed2e9f00d8e232ea2b39e451a006b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/input/kernel_D.input.in @@ -0,0 +1,7 @@ +L=#KD_L# +T=#KD_T# + +# no of processors per direction, time direction chosen automatically +NrXProcs = #KD_NP_X# +NrYProcs = #KD_NP_Y# +NrZProcs = #KD_NP_Z# diff --git a/qcd/part_cpu/applications/QCD/input/kernel_E.input.in b/qcd/part_cpu/applications/QCD/input/kernel_E.input.in new file mode 100644 index 0000000000000000000000000000000000000000..461ca9e2b921991e17fd4cbf83026041d917c179 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/input/kernel_E.input.in @@ -0,0 +1,15 @@ +#lattice +nx #KE_NX# +ny #KE_NY# +nz #KE_NZ# +nt #KE_NT# +totnodes #KE_PROCS# + +#wilson +mass_wilson #KE_WILSON_MASS# + +#max iterations +max_cg_iters #KE_MAXITER# + +#etc +verbose 1 diff --git a/qcd/part_cpu/applications/QCD/patterns-gprof-qcd.xml b/qcd/part_cpu/applications/QCD/patterns-gprof-qcd.xml new file mode 100644 index 0000000000000000000000000000000000000000..e4f83d1f3925fb72b10ac27384b05832ca771b8e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/patterns-gprof-qcd.xml @@ -0,0 +1,20 @@ + + + JuBE: gprof: proc 1:\s*$patwrd\s*$patnfp + JuBE: gprof: proc 1:\s*$patnwrd\s*$patfp + + JuBE: gprof: proc 2:\s*$patwrd\s*$patnfp + JuBE: gprof: proc 2:\s*$patnwrd\s*$patfp + + JuBE: gprof: proc 3:\s*$patwrd\s*$patnfp + JuBE: gprof: proc 3:\s*$patnwrd\s*$patfp + + JuBE: gprof: proc 4:\s*$patwrd\s*$patnfp + JuBE: gprof: proc 4:\s*$patnwrd\s*$patfp + + JuBE: gprof: proc 5:\s*$patwrd\s*$patnfp + JuBE: gprof: proc 5:\s*$patnwrd\s*$patfp + + JuBE: gprof: proc $patint:\s+$patwrd\s+$patfp + + \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/patterns-ihpct-qcd.xml b/qcd/part_cpu/applications/QCD/patterns-ihpct-qcd.xml new file mode 100644 index 0000000000000000000000000000000000000000..9b0d9be02118c3150871797ecba74dead0efa088 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/patterns-ihpct-qcd.xml @@ -0,0 +1,31 @@ + + + IHPCT: libHPM: in section kernel_A: PM_FPU_FLOP:\s*$patint + IHPCT: libHPM: in section kernel_B: PM_FPU_FLOP:\s*$patint + IHPCT: libHPM: in section kernel_C: PM_FPU_FLOP:\s*$patint + IHPCT: libHPM: in section kernel_D: PM_FPU_FLOP:\s*$patint + IHPCT: libHPM: in section kernel_E: PM_FPU_FLOP:\s*$patint + + IHPCT: libHPM: in section kernel_A: \% of peak performance:\s*$patint + IHPCT: libHPM: in section kernel_B: \% of peak performance:\s*$patint + IHPCT: libHPM: in section kernel_C: \% of peak performance:\s*$patint + IHPCT: libHPM: in section kernel_D: \% of peak performance:\s*$patint + IHPCT: libHPM: in section kernel_E: \% of peak performance:\s*$patint + + IHPCT: libHPM: in section kernel_A: number of load/stores per L1 miss:\s*$patint + IHPCT: libHPM: in section kernel_B: number of load/stores per L1 miss:\s*$patint + IHPCT: libHPM: in section kernel_C: number of load/stores per L1 miss:\s*$patint + IHPCT: libHPM: in section kernel_D: number of load/stores per L1 miss:\s*$patint + IHPCT: libHPM: in section kernel_E: number of load/stores per L1 miss:\s*$patint + + IHPCT: libHPM: in section kernel_A: $patwrd:\s+$patint + IHPCT: libHPM: in section kernel_B: $patwrd:\s+$patint + IHPCT: libHPM: in section kernel_C: $patwrd:\s+$patint + IHPCT: libHPM: in section kernel_D: $patwrd:\s+$patint + IHPCT: libHPM: in section kernel_E: $patwrd:\s+$patint + + IHPCT: libHPM: in section QCD: $patwrd:\s+$patint + + IHPCT: MPITracer: median communication time =\s*$patfp\s*sec + + diff --git a/qcd/part_cpu/applications/QCD/patterns-jube-qcd.xml b/qcd/part_cpu/applications/QCD/patterns-jube-qcd.xml new file mode 100644 index 0000000000000000000000000000000000000000..80081f279e56ed788d6fedb72ebcb21021f214d1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/patterns-jube-qcd.xml @@ -0,0 +1,45 @@ + + + JuBE: total mean run time: $patfp + JuBE: total mean run time: $patfp + + JuBE global mean timing statistics:\s*kernel_A\s*$patnfp\s*$patnfp\s*$patnfp\s*$patfp + JuBE global mean timing statistics:\s*kernel_B\s*$patnfp\s*$patnfp\s*$patnfp\s*$patfp + JuBE global mean timing statistics:\s*kernel_C\s*$patnfp\s*$patnfp\s*$patnfp\s*$patfp + JuBE global mean timing statistics:\s*kernel_D\s*$patnfp\s*$patnfp\s*$patnfp\s*$patfp + JuBE global mean timing statistics:\s*kernel_E\s*$patnfp\s*$patnfp\s*$patnfp\s*$patfp + + JuBE global mean timing statistics:\s*kernel_A\s*$patfp\s*$patnfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_B\s*$patfp\s*$patnfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_C\s*$patfp\s*$patnfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_D\s*$patfp\s*$patnfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_E\s*$patfp\s*$patnfp\s*$patnfp\s*$patnfp + + JuBE global mean timing statistics:\s*kernel_A\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_B\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_C\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_D\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_E\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp + + JuBE global mean timing statistics:\s*kernel_A\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_B\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_C\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_D\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_E\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp + + JuBE global mean timing statistics:\s*kernel_A\s*$patnfp\s*$patnfp\s*$patfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_B\s*$patnfp\s*$patnfp\s*$patfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_C\s*$patnfp\s*$patnfp\s*$patfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_D\s*$patnfp\s*$patnfp\s*$patfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_E\s*$patnfp\s*$patnfp\s*$patfp\s*$patnfp + + + JuBE: total max mem:\s*$patint + + JuBE: max mem for kernel_A:\s*$patint + JuBE: max mem for kernel_B:\s*$patint + JuBE: max mem for kernel_C:\s*$patint + JuBE: max mem for kernel_D:\s*$patint + JuBE: max mem for kernel_E:\s*$patint + + \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/patterns-papi-qcd.xml b/qcd/part_cpu/applications/QCD/patterns-papi-qcd.xml new file mode 100644 index 0000000000000000000000000000000000000000..caf68d1e3ac62256b676feb1501593a6bfbdb8a6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/patterns-papi-qcd.xml @@ -0,0 +1,17 @@ + + + JuBE: PAPI counter for kernel_A: PAPI_TOT_CYC:\s*$patint + JuBE: PAPI counter for kernel_B: PAPI_TOT_CYC:\s*$patint + JuBE: PAPI counter for kernel_C: PAPI_TOT_CYC:\s*$patint + JuBE: PAPI counter for kernel_D: PAPI_TOT_CYC:\s*$patint + JuBE: PAPI counter for kernel_E: PAPI_TOT_CYC:\s*$patint + + JuBE: PAPI counter for kernel_A: PAPI_FP_OPS:\s*$patint + JuBE: PAPI counter for kernel_B: PAPI_FP_OPS:\s*$patint + JuBE: PAPI counter for kernel_C: PAPI_FP_OPS:\s*$patint + JuBE: PAPI counter for kernel_D: PAPI_FP_OPS:\s*$patint + JuBE: PAPI counter for kernel_E: PAPI_FP_OPS:\s*$patint + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-functional-cartesius.xml b/qcd/part_cpu/applications/QCD/prace-functional-cartesius.xml new file mode 100644 index 0000000000000000000000000000000000000000..bdd7b2cafe16fa5351e7f62f010f744948b84e10 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-functional-cartesius.xml @@ -0,0 +1,188 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-functional-cartesius_24.xml b/qcd/part_cpu/applications/QCD/prace-functional-cartesius_24.xml new file mode 100644 index 0000000000000000000000000000000000000000..f7546843fd11e6de8d12fcfabc9214a01301037c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-functional-cartesius_24.xml @@ -0,0 +1,188 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-functional-hector.xml b/qcd/part_cpu/applications/QCD/prace-functional-hector.xml new file mode 100644 index 0000000000000000000000000000000000000000..e600a4b7adf4db00a4366d4adf9f69a550778f0a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-functional-hector.xml @@ -0,0 +1,182 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-functional-hermit.xml b/qcd/part_cpu/applications/QCD/prace-functional-hermit.xml new file mode 100644 index 0000000000000000000000000000000000000000..80fc4c95f3310dc60b8e409f0bd9f0a2b326d375 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-functional-hermit.xml @@ -0,0 +1,186 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-functional-huygens.xml b/qcd/part_cpu/applications/QCD/prace-functional-huygens.xml new file mode 100644 index 0000000000000000000000000000000000000000..b9228ca77c72197f8317b3a22f6ecbdba831e3e4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-functional-huygens.xml @@ -0,0 +1,197 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-functional-jugene.xml b/qcd/part_cpu/applications/QCD/prace-functional-jugene.xml new file mode 100644 index 0000000000000000000000000000000000000000..489fe4fb1ba0c007dfaf509918051739af4ea8db --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-functional-jugene.xml @@ -0,0 +1,194 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-functional-juqueen.xml b/qcd/part_cpu/applications/QCD/prace-functional-juqueen.xml new file mode 100644 index 0000000000000000000000000000000000000000..68af37a2c7eacd6d68adab42e56870cdeaa1406b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-functional-juqueen.xml @@ -0,0 +1,198 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-functional-juropa.xml b/qcd/part_cpu/applications/QCD/prace-functional-juropa.xml new file mode 100644 index 0000000000000000000000000000000000000000..0b83b7abb4a504f6da2eb7cbe08e53d32bfb9085 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-functional-juropa.xml @@ -0,0 +1,188 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-functional-louhi.xml b/qcd/part_cpu/applications/QCD/prace-functional-louhi.xml new file mode 100644 index 0000000000000000000000000000000000000000..8544acda0cc3ad228d6546257290205998c82ec7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-functional-louhi.xml @@ -0,0 +1,184 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-functional-marconi.xml b/qcd/part_cpu/applications/QCD/prace-functional-marconi.xml new file mode 100644 index 0000000000000000000000000000000000000000..0557883ef7a1f527fcb10b0f9eefc9dd67222771 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-functional-marconi.xml @@ -0,0 +1,188 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-functional-marconi2.xml b/qcd/part_cpu/applications/QCD/prace-functional-marconi2.xml new file mode 100644 index 0000000000000000000000000000000000000000..19b2d1f8b82bf1585eca082c89626c31c52d8664 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-functional-marconi2.xml @@ -0,0 +1,188 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-functional-supermuc.xml b/qcd/part_cpu/applications/QCD/prace-functional-supermuc.xml new file mode 100644 index 0000000000000000000000000000000000000000..b3aedd187111f9acf057d0fe77e951fff67cc656 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-functional-supermuc.xml @@ -0,0 +1,189 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-cartesius.xml b/qcd/part_cpu/applications/QCD/prace-scaling-cartesius.xml new file mode 100644 index 0000000000000000000000000000000000000000..1a12d10dfcb2a9c6b59660711767e9d8d2b2543a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-cartesius.xml @@ -0,0 +1,223 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-cartesius_24.xml b/qcd/part_cpu/applications/QCD/prace-scaling-cartesius_24.xml new file mode 100644 index 0000000000000000000000000000000000000000..a6f7c403a116a204d4a4968b01ce0c6b78df6d1e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-cartesius_24.xml @@ -0,0 +1,192 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-curie.xml b/qcd/part_cpu/applications/QCD/prace-scaling-curie.xml new file mode 100644 index 0000000000000000000000000000000000000000..ff6563a561f25773b5a8a3e4c113bd518c9699d2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-curie.xml @@ -0,0 +1,193 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-hector-medium.xml b/qcd/part_cpu/applications/QCD/prace-scaling-hector-medium.xml new file mode 100644 index 0000000000000000000000000000000000000000..57ae9c7365f668fcf76eb308be554a183de1fae0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-hector-medium.xml @@ -0,0 +1,192 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-hector-small.xml b/qcd/part_cpu/applications/QCD/prace-scaling-hector-small.xml new file mode 100644 index 0000000000000000000000000000000000000000..8d90296be638349041bcbadbaeecf422d8346cd8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-hector-small.xml @@ -0,0 +1,192 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-hermit.xml b/qcd/part_cpu/applications/QCD/prace-scaling-hermit.xml new file mode 100644 index 0000000000000000000000000000000000000000..94908820eb67a5172cabf59e860daa3bf309b734 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-hermit.xml @@ -0,0 +1,192 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-huygens.xml b/qcd/part_cpu/applications/QCD/prace-scaling-huygens.xml new file mode 100644 index 0000000000000000000000000000000000000000..adcf6b83f2bc6269f13acd8b1e211dc8427fa317 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-huygens.xml @@ -0,0 +1,202 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-jugene.xml b/qcd/part_cpu/applications/QCD/prace-scaling-jugene.xml new file mode 100644 index 0000000000000000000000000000000000000000..09ad25ce3c09ad9ec355b017df99d0e294f3a546 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-jugene.xml @@ -0,0 +1,204 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-juqueen.xml b/qcd/part_cpu/applications/QCD/prace-scaling-juqueen.xml new file mode 100644 index 0000000000000000000000000000000000000000..e09e5907dc13f70738410f9e83376ad04eae2a16 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-juqueen.xml @@ -0,0 +1,204 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-juropa.xml b/qcd/part_cpu/applications/QCD/prace-scaling-juropa.xml new file mode 100644 index 0000000000000000000000000000000000000000..9a64dd3625e4b706d7eae32bc0ed07115a9a6830 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-juropa.xml @@ -0,0 +1,193 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-louhi.xml b/qcd/part_cpu/applications/QCD/prace-scaling-louhi.xml new file mode 100644 index 0000000000000000000000000000000000000000..519be3d8fd83344f08f2d54c4d9586b73d3131e8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-louhi.xml @@ -0,0 +1,185 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-marconi.xml b/qcd/part_cpu/applications/QCD/prace-scaling-marconi.xml new file mode 100644 index 0000000000000000000000000000000000000000..b4bdd614eda85b7501d5e1435a094257bf5b18ce --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-marconi.xml @@ -0,0 +1,194 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-supermuc.xml b/qcd/part_cpu/applications/QCD/prace-scaling-supermuc.xml new file mode 100644 index 0000000000000000000000000000000000000000..908f78c65c1c224dd9093f4184fd8d43b5432555 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-supermuc.xml @@ -0,0 +1,194 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prepare.xml b/qcd/part_cpu/applications/QCD/prepare.xml new file mode 100644 index 0000000000000000000000000000000000000000..04c33977e48bc8455a0097425df15d4bacc0af8e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prepare.xml @@ -0,0 +1,85 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/result.gprof.xml b/qcd/part_cpu/applications/QCD/result.gprof.xml new file mode 100644 index 0000000000000000000000000000000000000000..b1486a3fd9a10889a58be28b26eddebf9c34205b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/result.gprof.xml @@ -0,0 +1,9 @@ + + + KERNELS, GPROF_01_NAME, GPROF_01_PART, GPROF_02_NAME, GPROF_02_PART, GPROF_03_NAME, GPROF_03_PART + + + + name + + diff --git a/qcd/part_cpu/applications/QCD/result.hwc.xml b/qcd/part_cpu/applications/QCD/result.hwc.xml new file mode 100644 index 0000000000000000000000000000000000000000..b965eae3c00291053b4bdc0e3a506e17728c0170 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/result.hwc.xml @@ -0,0 +1,13 @@ + + + KERNELS, walltime, HWC_FLOP_KA_avg, HWC_FLOP_KB_avg, HWC_FLOP_KC_avg + + + + name + + + + diff --git a/qcd/part_cpu/applications/QCD/result.mpi.xml b/qcd/part_cpu/applications/QCD/result.mpi.xml new file mode 100644 index 0000000000000000000000000000000000000000..085cb5e36095059f84d6a558de5b6c66ddc5258e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/result.mpi.xml @@ -0,0 +1,9 @@ + + + KERNELS, walltime, MPI_COMM_TIME_avg, MPI_COMM_TIME_std + + + + name + + diff --git a/qcd/part_cpu/applications/QCD/result.wct.xml b/qcd/part_cpu/applications/QCD/result.wct.xml new file mode 100644 index 0000000000000000000000000000000000000000..faf08f81146ee3bd87fb054851fb30d3352d7015 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/result.wct.xml @@ -0,0 +1,9 @@ + + + KERNELS, walltime, KA_wct, KB_wct, KC_wct, KD_wct + + + + name + + diff --git a/qcd/part_cpu/applications/QCD/result.xml b/qcd/part_cpu/applications/QCD/result.xml new file mode 100644 index 0000000000000000000000000000000000000000..429be125b68e6e3317ef42d14de491715737cd24 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/result.xml @@ -0,0 +1,64 @@ + + + + ncpus, time, time_KA, time_KB, time_KC, time_KD, time_KE + + + + + + ncpus, KERNELS, optflags, walltime, WCT_KA, WCT_KB, WCT_KC, WCT_KD, WCT_KE, COMMENT + + + + ncpus, KERNELS, walltime, TIME_INIT_KA, TIME_RUN_KA, TIME_FINALIZE_KA, KA_NX, KA_NY, KA_NZ, KA_NT + + + ncpus, KERNELS, walltime, TIME_INIT_KB, TIME_RUN_KB, TIME_FINALIZE_KB, KB_NX, KB_NY, KB_NZ + + + ncpus, KERNELS, walltime, TIME_INIT_KC, TIME_RUN_KC, TIME_FINALIZE_KC, KC_NX, KC_NY, KC_NZ, KC_NT + + + ncpus, KERNELS, walltime, TIME_INIT_KD, TIME_RUN_KD, TIME_FINALIZE_KD, KD_L, KD_T + + + ncpus, KERNELS, walltime, TIME_INIT_KE, TIME_RUN_KE, TIME_FINALIZE_KE, KE_NX, KE_NY, KE_NZ, KE_NT + + + + + KERNELS, walltime, MEM_MAX, MEM_KA, MEM_KB, MEM_KC, MEM_KD, MEM_KE, COMMENT + + + + KERNELS, GPROF_01_NAME, GPROF_01_PART, GPROF_02_NAME, GPROF_02_PART, GPROF_03_NAME, GPROF_03_PART + + + + KERNELS, walltime, HWC_FLOP_KA_avg, HWC_FLOP_KB_avg, HWC_FLOP_KC_avg, HWC_FLOP_KD_avg, HWC_FLOP_KE_avg + + + + KERNELS, walltime, MPI_COMM_TIME + + + + + ncpus, bgconn, optflags, KA_NPROC0, KA_NPROC1, KA_NPROC2, KA_NPROC3 + + + + ncpus, bgconn, walltime, KA_LATTICE0, KA_LATTICE1, KA_LATTICE2, KA_LATTICE3, KA_LIBCOMM, KA_LIBCLOVER, KA_LIBD + + + + jobenddate + + + + + KERNELS, ncpus, subid + + + diff --git a/qcd/part_cpu/applications/QCD/run/collectData.sh.in b/qcd/part_cpu/applications/QCD/run/collectData.sh.in new file mode 100644 index 0000000000000000000000000000000000000000..eb4483efcef2739e4964915f0b7bb394ff1687dd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/run/collectData.sh.in @@ -0,0 +1,24 @@ +rm -f IHPCT.log GPROF.log CRAYPAT.log + +##COLLECT_IHPCT_HWC# #JUGENE# sed '$d' < QCD.viz > tmp.viz; mv tmp.viz QCD.viz + #COLLECT_IHPCT_HWC# #PERL# #BENCHHOME#/../../utils/ihpct/parseHWC.pl QCD*.viz >> IHPCT.log + +#COLLECT_IHPCT_MPITR# #PERL# #BENCHHOME#/../../utils/ihpct/parseMPITR.pl mpi_profile.* >> IHPCT.log + +#COLLECT_GPROF# #HUYGENS# gprof #EXECUTABLE# profdir*/gmon.out > GPROF.dat; #PERL# #BENCHHOME#/../../utils/gprof/parseGPROF.pl -1 1 GPROF.dat > GPROF.log +#COLLECT_GPROF# #JUGENE# gprof #EXECUTABLE# gmon.out.* > GPROF.dat; #PERL# #BENCHHOME#/../../utils/gprof/parseGPROF.pl -1 1 GPROF.dat > GPROF.log +#COLLECT_GPROF# #JUMP# gprof #EXECUTABLE# gmon.*.out > GPROF.dat; #PERL# #BENCHHOME#/../../utils/gprof/parseGPROF.pl -2 1 GPROF.dat > GPROF.log + +#COLLECT_CRAYPAT# source /opt/modules/default/init/sh; module load xt-craypat + +#COLLECT_CRAYPAT# pat_report -d P -b totals *.xf > CRAYPAT.HWC.dat; pat_report -d P -b totals ./*/*.xf > CRAYPAT.HWC.dat; +#COLLECT_CRAYPAT# #PERL# #BENCHHOME#/../../utils/craypat/parseCRAYPAT.pl HWC CRAYPAT.HWC.dat >> CRAYPAT.log + +#COLLECT_CRAYPAT# pat_report -d flops *.xf > CRAYPAT.FLOPS.dat; pat_report -d flops ./*/*.xf > CRAYPAT.FLOPS.dat; +#COLLECT_CRAYPAT# #PERL# #BENCHHOME#/../../utils/craypat/parseCRAYPAT.pl FLOPS CRAYPAT.FLOPS.dat >> CRAYPAT.log + +#COLLECT_CRAYPAT# pat_report -d am *.xf > CRAYPAT.HEAP.dat; pat_report -d am ./*/*.xf > CRAYPAT.HEAP.dat; +#COLLECT_CRAYPAT# #PERL# #BENCHHOME#/../../utils/craypat/parseCRAYPAT.pl HEAP CRAYPAT.HEAP.dat >> CRAYPAT.log + +#COLLECT_CRAYPAT# pat_report -d time@%0.1 *.xf > CRAYPAT.TIME.dat; pat_report -d time@%0.1 ./*/*.xf > CRAYPAT.TIME.dat; +#COLLECT_CRAYPAT# #PERL# #BENCHHOME#/../../utils/craypat/parseCRAYPAT.pl TIME CRAYPAT.TIME.dat >> CRAYPAT.log diff --git a/qcd/part_cpu/applications/QCD/run/verify_qcd.pl b/qcd/part_cpu/applications/QCD/run/verify_qcd.pl new file mode 100644 index 0000000000000000000000000000000000000000..4626ce67b54506c73fed26614c559401d85c55f0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/run/verify_qcd.pl @@ -0,0 +1,32 @@ +#!/usr/local/bin/perl -w + +use strict; +use Carp; + +my $patint="([\\+\\-\\d]+)"; # Pattern for Integer number +my $patfp ="([\\+\\-\\d.Ee]+)"; # Pattern for Floating Point number +my $patwrd="([\^\\s]+)"; # Pattern for Work (all noblank characters) +my $patnint="[\\+\\-\\d]+"; # Pattern for Integer number, no () +my $patnfp ="[\\+\\-\\d.Ee]+"; # Pattern for Floating Point number, no () +my $patnwrd="[\^\\s]+"; # Pattern for Work (all noblank characters), no () +my $patbl ="\\s+"; # Pattern for blank space (variable length) + +if((scalar @ARGV) != 1) { + printf(STDERR "incorrect number of parameter (%d) of $0 (6 required)\n",scalar @ARGV); + exit(-1); +} + +my $xmloutfile = $ARGV[0]; +my $vcheck=0; +my $vcomment="not implemented"; + +open(XMLOUT,"> $xmloutfile") || die "cannot open file $xmloutfile"; +print XMLOUT "\n"; +print XMLOUT " \n"; +print XMLOUT " \n"; +print XMLOUT "\n"; +print XMLOUT "\n"; +close(XMLOUT); + + +exit(0); diff --git a/qcd/part_cpu/applications/QCD/sizes.pp b/qcd/part_cpu/applications/QCD/sizes.pp new file mode 100644 index 0000000000000000000000000000000000000000..d0e69102317db6ed0c92dc78234647091a8fbe81 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/sizes.pp @@ -0,0 +1,7 @@ +medium problem size + + * KA_NX=KA_NY=32, KA_NZ=KA_NT=64 + * KB_NX=KB_NY=KB_NC=256 + * KC_NX=KC_NY=KC_NZ=KC_NT=8 + * KD_L=KD_T=64 + * KE_NX=KE_NY=KE_NZ=64, KE_NT=32 \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/Makefile.in b/qcd/part_cpu/applications/QCD/src/Makefile.in new file mode 100644 index 0000000000000000000000000000000000000000..f846472788e0196aee61ca5fad45b646474647d0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/Makefile.in @@ -0,0 +1,35 @@ +KERNEL_ARCHS = #KERNEL_ARCHS# + +qcd-bench: qcd-bench.o $(KERNEL_ARCHS) + #LD# #LDFLAGS# -o #EXECNAME# qcd-bench.o $(KERNEL_ARCHS) #LDLIBS# + +kernel_A.a: + cd kernel_A && gmake prep-#KA_PLATFORM# && gmake kernel + +kernel_B.a: + cd kernel_B && gmake kernel + +kernel_C.a: + cd kernel_C && make kernel + +kernel_D.a: + cd kernel_D && make kernel + +kernel_E.a: + cd kernel_E && gmake kernel + +qcd-bench.o: qcd-bench.c + #MPI_CC# #CFLAGS# -c -o qcd-bench.o qcd-bench.c + +clean: + cd kernel_A && gmake clobber + cd kernel_B && make clean + cd kernel_C && make clean + cd kernel_D && make clean + cd kernel_E && gmake clean + #RM# -f qcd-bench.o qcd-bench + #RM# -f $(KERNEL_ARCHS) + +#nm -X64 *.a | grep ' T ' | cut -f 1 -d ' ' | sort > all.sort.dat +#nm -X64 *.a | grep ' T ' | cut -f 1 -d ' ' | sort | sort -u > all.sort.unique.dat +#diff all.sort.dat all.sort.unique.dat | grep '<' > doublicates.dat \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/exchangeNames.sh b/qcd/part_cpu/applications/QCD/src/exchangeNames.sh new file mode 100644 index 0000000000000000000000000000000000000000..b1ab0f9efbf8d47b6e9ffd8d90d7856a31cb71c7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/exchangeNames.sh @@ -0,0 +1,44 @@ +replaceInFile () +{ + sed "s/${1}(/${2}(/g" $3 > tmp.dat; cp tmp.dat $3 +} + +case $1 in + "doublicates") echo "generate doublicate list"; + echo "parse files:\n`ls *.a`"; + nm -X64 *.a | grep ' T ' | cut -f 1 -d ' ' | sort > all.sort.dat + nm -X64 *.a | grep ' T ' | cut -f 1 -d ' ' | sort | sort -u > all.sort.unique.dat + diff all.sort.dat all.sort.unique.dat | grep '<' | cut -f 2- -d '.'> doublicates.dat + echo "found `wc -l doublicates.dat` doublicates" ;; + + "find") echo "looking for $2"; + grep -r $2 `find . -name '*.[c|h|f|f90|F90]'` | cut -f 2- -d '<' > find.dat; + cat find.dat;; + + "replace") echo "replace $2 by $3 in $4"; + replaceInFile $2 $3 $4;; + + "replaceAllFiles") echo "replace all $2 by $3"; + grep -r $2 `find . -name '*.[c|h|f|f90|F90]'` | cut -f 2- -d '<' | cut -f 1 -d ':' | sort -u > find.dat; + for i in `cat find.dat` + do + echo "replacing $2 by $3 in $i"; + replaceInFile $2 $3 $i + done + ;; + + "replaceAll") + echo "replace all doublicates in this directory, using doublicate list $2 and postfix $3"; + for dn in `cat $2` + do + echo "replace $dn" + grep -r $dn `find . -name '*.[c|h|f|f90|F90]'` | cut -f 2- -d '<' | cut -f 1 -d ':' | sort -u > find.dat; + for fn in `cat find.dat` + do + echo "replacing $dn by ${dn}_$3 in $fn"; + replaceInFile ${dn} ${dn}_${3} ${fn} + done + done + ;; + +esac \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..7aab461bd39a9afb5243cf59d7563fbf804fe00e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile @@ -0,0 +1,185 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +# +#------------------------------------------------------------------------------- +# +# Makefile +# +#=============================================================================== + +include Makefile.defs + +MODULES_DIR = modules + +.SUFFIXES: +.SUFFIXES: .o .F90 .c + + +.F90.f90: + $(FPP) $(FPPFLAGS) $< > $@ + +.F90.o: + $(FPP) $(FPPFLAGS) $< > $*.f90 + $(F90) -c $(FFLAGS) -I$(MODULES_DIR) $*.f90 + + +MODULES = $(MODULES_DIR)/*.o + +OBJS = \ + action.o \ + cg.o \ + checks.o \ + $(CKSUM_O) \ + conf.o \ + conf_info.o \ + cooling.o \ + dsd.o \ + dsg.o \ + dsf.o \ + dsf1.o \ + dsf2.o \ + files.o \ + flip_bc.o \ + hmc.o \ + hmc_init_p.o \ + hmc_init_phi.o \ + hmc_integrator.o \ + hmc_forces.o \ + hmc_leap_frog.o \ + hmc_test.o \ + hmc_u.o \ + h_mult.o \ + index.o \ + index2.o \ + init_common.o \ + init_modules.o \ + iteration_count.o \ + mc.o \ + misc.o \ + mre.o \ + mtdagmt.o \ + m_tilde.o \ + polyakov_loop.o \ + $(RANDOM_O) \ + sc.o \ + service.o \ + staple.o \ + su3.o \ + swap.o \ + timing.o \ + traces.o \ + $(UUU_O) \ + w_mult.o \ + xyzt2i.o + +LIBS = d/$(LIBD) comm/$(LIBCOMM) clover/$(LIBCLOVER) + +kernel: + cd modules && $(MAKE) + cd d && $(MAKE) fast + cd comm && $(MAKE) fast + cd clover && $(MAKE) fast + $(MAKE) ../kernel_A.a + +../kernel_A.a: bqcd.o $(MODULES) $(OBJS) $(LIBS) + $(AR) $(ARFLAGS) ../kernel_A.a *.o d/*.o modules/*.o comm/*.o clover/*.o + +bqcd: bqcd.o $(MODULES) $(OBJS) $(LIBS) + $(F90) -o $@ $(LDFLAGS) bqcd.o $(MODULES) $(OBJS) $(LIBS) $(SYSLIBS) + + +fast: + cd modules && $(MAKE) + cd d && $(MAKE) fast + cd comm && $(MAKE) fast + cd clover && $(MAKE) fast + $(FAST_MAKE) bqcd + +clean: + rm -f bqcd.[0-9][0-9][0-9].* diag.[0-9][0-9] core app.rif + rm -f random_test random_test.dump random_test.out + rm -f test_echo + rm -f a.out out out1 out2 + rm -f ../kernel-bqcd.a + +tidy: clean + rm -f *.[Toid] *.f90 *.mod work.pc work.pcl + +clobber: tidy + rm -f bqcd + $(MAKE) clobber_libs + cd modules && $(MAKE) clean + +Modules: + cd modules && $(MAKE) + +libd: + cd d && $(MAKE) + +libclover: $(MODULES) + cd clover && $(MAKE) + +libs: + cd d && $(MAKE) + cd comm && $(MAKE) + cd clover && $(MAKE) + +clean_libs: + cd d && $(MAKE) clean + cd comm && $(MAKE) clean + cd clover && $(MAKE) clean + +clobber_libs: + cd d && $(MAKE) clobber + cd comm && $(MAKE) clobber + cd clover && $(MAKE) clobber + +the_ranf_test: ranf.o + $(FPP) $(FPPFLAGS) ranf_test.F90 ranf_test.f90 + $(F90) ranf_test.f90 ranf.o + ./a.out | diff - ranf_test.reference + +test_echo: test_echo.o service.o + $(F90) -o $@ $(LDFLAGS) test_echo.o service.o + +prep: + rm -f Makefile.var service.F90 + ln -s platform/Makefile-$(PLATFORM).var Makefile.var + ln -s platform/service-$(PLATFORM).F90 service.F90 + +prep-altix: + $(MAKE) prep PLATFORM=altix + +prep-bgl: + $(MAKE) prep PLATFORM=bgl + +prep-cray: + $(MAKE) prep PLATFORM=cray + +prep-hitachi: + $(MAKE) prep PLATFORM=hitachi + +prep-hitachi-omp: + $(MAKE) prep PLATFORM=hitachi-omp + +prep-ibm: + $(MAKE) prep PLATFORM=ibm + +prep-hp: + $(MAKE) prep PLATFORM=hp + +prep-intel: + $(MAKE) prep PLATFORM=intel + +prep-nec: + $(MAKE) prep PLATFORM=nec + +prep-sun: + $(MAKE) prep PLATFORM=sun + +#=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.defs.in b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.defs.in new file mode 100644 index 0000000000000000000000000000000000000000..574fc5d616c56d9a63511edd4fdc39102cead3f5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.defs.in @@ -0,0 +1,32 @@ +d3_buffer_vol = #d3_buffer_vol# + +SHELL = #SHELL# + +FPP = #FPP# +FPPFLAGS = #FPPFLAGS# + +F90 = #MPI_F90# +FFLAGS = #F90FLAGS# + +CC = #MPI_CC# +CFLAGS = #CFLAGS# + +AR = #AR# +ARFLAGS = #ARFLAGS# + +RANLIB = #RANLIB# + +LDFLAGS = #LDFLAGS# +SYSLIBS = #SYSLIBS# + +FAST_MAKE = #FAST_MAKE# + +RM = #RM# + +CKSUM_O = #CKSUM_O# +RANDOM_O = #RANDOM_O# +UUU_O = #UUU_O# + +LIBD = #KA_LIBD# +LIBCOMM = #KA_LIBCOMM# +LIBCLOVER = #KA_LIBCLOVER# \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.in b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.in new file mode 100644 index 0000000000000000000000000000000000000000..b1ee85d3623d75eb8c53b99f22757e8de8c6e937 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.in @@ -0,0 +1,176 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +# +#------------------------------------------------------------------------------- +# +# Makefile +# +#=============================================================================== + +#include Makefile.var +include Makefile.defs + +.SUFFIXES: +.SUFFIXES: .o .F90 .c + + +.F90.f90: + $(FPP) $(FPPFLAGS) $< > $@ + +.F90.o: + $(FPP) $(FPPFLAGS) $< > $*.f90 + $(F90) -c $(FFLAGS) $*.f90 + +MODULES_DIR = modules + +MODULES = modules/*.o + +OBJS = \ + action.o \ + cg.o \ + checks.o \ + $(CKSUM_O) \ + conf.o \ + conf_info.o \ + cooling.o \ + dsd.o \ + dsg.o \ + dsf.o \ + dsf1.o \ + dsf2.o \ + files.o \ + flip_bc.o \ + hmc.o \ + hmc_init_p.o \ + hmc_init_phi.o \ + hmc_integrator.o \ + hmc_forces.o \ + hmc_leap_frog.o \ + hmc_test.o \ + hmc_u.o \ + h_mult.o \ + index.o \ + index2.o \ + init_common.o \ + init_modules.o \ + iteration_count.o \ + mc.o \ + misc.o \ + mre.o \ + mtdagmt.o \ + m_tilde.o \ + polyakov_loop.o \ + $(RANDOM_O) \ + sc.o \ + service.o \ + staple.o \ + su3.o \ + swap.o \ + timing.o \ + traces.o \ + $(UUU_O) \ + w_mult.o \ + xyzt2i.o + +LIBS = d/$(LIBD) comm/$(LIBCOMM) clover/$(LIBCLOVER) + +bqcd: bqcd.o $(MODULES) $(OBJS) $(LIBS) + $(F90) -o $@ $(LDFLAGS) bqcd.o $(MODULES) $(OBJS) $(LIBS) $(SYSLIBS) + + +fast: + cd modules && $(MAKE) + cd d && $(MAKE) fast + cd comm && $(MAKE) fast + cd clover && $(MAKE) fast + $(FAST_MAKE) bqcd + mv bqcd #EXECNAME# +# mv bqcd ../BQCD_SGI-ALTIX_cname_SGI-ALTIX.exe + +clean: + rm -f bqcd.[0-9][0-9][0-9].* diag.[0-9][0-9] core app.rif + rm -f random_test random_test.dump random_test.out + rm -f test_echo + rm -f a.out out out1 out2 + +tidy: clean + rm -f *.[Toid] *.f90 *.mod work.pc work.pcl + +clobber: tidy + rm -f bqcd + $(MAKE) clobber_libs + cd modules && $(MAKE) clean + +Modules: + cd modules && $(MAKE) + +libd: + cd d && $(MAKE) + +libclover: $(MODULES) + cd clover && $(MAKE) + +libs: + cd d && $(MAKE) + cd comm && $(MAKE) + cd clover && $(MAKE) + +clean_libs: + cd d && $(MAKE) clean + cd comm && $(MAKE) clean + cd clover && $(MAKE) clean + +clobber_libs: + cd d && $(MAKE) clobber + cd comm && $(MAKE) clobber + cd clover && $(MAKE) clobber + +the_ranf_test: ranf.o + $(FPP) $(FPPFLAGS) ranf_test.F90 ranf_test.f90 + $(F90) ranf_test.f90 ranf.o + ./a.out | diff - ranf_test.reference + +test_echo: test_echo.o service.o + $(F90) -o $@ $(LDFLAGS) test_echo.o service.o + +prep: + rm -f Makefile.var service.F90 + ln -s platform/Makefile-$(PLATFORM).var Makefile.var + ln -s platform/service-$(PLATFORM).F90 service.F90 + +prep-altix: + $(MAKE) prep PLATFORM=altix + +prep-bgl: + $(MAKE) prep PLATFORM=bgl + +prep-cray: + $(MAKE) prep PLATFORM=cray + +prep-hitachi: + $(MAKE) prep PLATFORM=hitachi + +prep-hitachi-omp: + $(MAKE) prep PLATFORM=hitachi-omp + +prep-ibm: + $(MAKE) prep PLATFORM=ibm + +prep-hp: + $(MAKE) prep PLATFORM=hp + +prep-intel: + $(MAKE) prep PLATFORM=intel + +prep-nec: + $(MAKE) prep PLATFORM=nec + +prep-sun: + $(MAKE) prep PLATFORM=sun + +#=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.var b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.var new file mode 100644 index 0000000000000000000000000000000000000000..4058c3904be9f1b63ec9cc2a5939e6a3ac34f158 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.var @@ -0,0 +1,108 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 2005, Hinnerk Stueben, Zuse-Institut Berlin +# +#------------------------------------------------------------------------------- +# +# Makefile-altix.var - settings on SGI-Altix +# +#------------------------------------------------------------------------------- + +timing = 1 +mpi = 1 +omp = 1 +shmem = +shmempi = +debug = +libd = 2 +d3_buffer_vol = 32*32*16*16 + +#------------------------------------------------------------------------------- + +SHELL = /bin/ksh + +FPP = mpif90 -g -E +FPP2 = icc -E -C -P +F90 = mpif90 +CC = mpicc +AR = ar +RANLIB = echo + +MODULES_FLAG = -I$(MODULES_DIR) + +MYFLAGS = -DINTEL -DALTIX +FFLAGS_STD= $(MODULES_FLAG) +CFLAGS_STD= -DNamesToLower_ +ARFLAGS = rv + +LDFLAGS = -Vaxlib +SYSLIBS = + +FAST_MAKE = gmake -j 8 + +CKSUM_O = cksum.o +RANDOM_O = ran.o ranf.o +UUU_O = uuu_f90.o + +LIBD = +LIBCOMM = lib_single_pe.a +LIBCLOVER = libclover.a + +#------------------------------------------------------------------------------- + +ifdef timing + MYFLAGS += -DTIMING +endif + +ifdef mpi + LIBCOMM = lib_mpi.a +endif + +ifdef omp + F90 += -openmp + MYFLAGS += -D_OPENMP +endif + +ifdef shmem + LDFLAGS += -lsma + LIBCOMM = lib_shmem.a +endif + +ifdef shmempi + LDFLAGS += -lsma + LIBCOMM = lib_shmempi.a +endif + +ifdef debug + FFLAGS = -g -O0 $(FFLAGS_STD) + CFLAGS = -g -O0 $(CFLAGS_STD) +else + FFLAGS = -O2 $(FFLAGS_STD) + CFLAGS = -O2 $(CFLAGS_STD) +endif + +ifeq ($(libd),1) + LIBD = libd.a + MYFLAGS += -DD3_BUFFER_VOL=1 +endif + +ifeq ($(libd),2) + LIBD = libd2.a + MYFLAGS += -DD3_BUFFER_VOL=1 +endif + +ifeq ($(libd),21) + LIBD = libd21.a + MYFLAGS += -DD3_BUFFER_VOL=1 +endif + +ifeq ($(libd),3) + LIBD = libd3.a + MYFLAGS += -DD3_BUFFER_VOL='$(d3_buffer_vol)' +endif + +#=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/action.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/action.F90 new file mode 100644 index 0000000000000000000000000000000000000000..24ad7a1c5268a8a0adc7042d49432513986f4769 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/action.F90 @@ -0,0 +1,130 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! action.F90 - calculation of actions +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +REAL function sf(para, conf) ! returns S_f + + use typedef_hmc + use module_p_interface + use module_vol + implicit none + + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(in) :: conf + + P_SPINCOL_FIELD, save :: a, b + + REAL, external :: dotprod, clover_action + integer :: iterations + external :: mtdagmt + + if (para%kappa == ZERO) then + sf = ZERO + else + ALLOCATE_SC_FIELD(a) + ALLOCATE_SC_FIELD(b) + + call flip_bc(conf%u) + + call sc_copy(a, conf%phi) ! A = phi + + call cg(mtdagmt, a, conf%phi, para, conf, iterations) ! A = inv(M~+ M~) Phi + call mtil(b, a, para, conf) ! B = M~ A + + sf = dotprod(b, b, SIZE_SC_FIELD) + + call flip_bc(conf%u) + endif + + if (para%csw_kappa /= ZERO) sf = sf + clover_action(conf%b(1,1,ODD)) +end + +!------------------------------------------------------------------------------- +REAL function sg(u) ! returns S_g + + use module_nn + use module_vol + implicit none + + GAUGE_FIELD :: u + REAL :: plaq, global_sum, p + SU3 :: uuu + integer :: i, e, o, mu, nu, j1, j2 + REAL, external :: Re_Tr_uu + + TIMING_START(timing_bin_plaq) + + plaq = 0 + + do mu = 1, DIM + do e = EVEN, ODD + o = EVEN + ODD - e + do nu = mu + 1, DIM + p = ZERO + !$omp parallel do reduction(+: p) private(j1, j2, uuu) + do i = 1, VOLH + + ! (j2,o) --<-- x nu + ! | | + ! v ^ ^ + ! | | | + ! (i,e) -->-- (j1,o) x--> mu + + + j1 = nn(i, e, mu, FWD) + j2 = nn(i, e, nu, FWD) + + uuu = 0 + call uuu_fwd(uuu, u(1, 1, j1, o, nu), & + u(1, 1, j2, o, mu), & + u(1, 1, i, e, nu)) + + p = p + Re_Tr_uu(uuu, u(1, 1, i, e, mu)) + enddo + !$omp end parallel do + plaq = plaq + p + enddo + enddo + enddo + + plaq = global_sum(plaq) + + sg = (6 * volume) - plaq / THREE + + TIMING_STOP(timing_bin_plaq) + +end + +!------------------------------------------------------------------------------- +REAL function sp(p) ! returns action of momenta p + + use module_vol + implicit none + REAL, external :: dotprod + GENERATOR_FIELD :: p + + integer :: mu, eo + + sp = ZERO + do mu = 1, DIM + do eo = EVEN, ODD + sp = sp + dotprod(p(1, 1, eo, mu), p(1, 1, eo, mu), NGEN * volh) + enddo + enddo + sp = sp * HALF + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/bqcd.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/bqcd.F90 new file mode 100644 index 0000000000000000000000000000000000000000..9141c4c2f7c6ad61a5a654ffd8f896df441d13aa --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/bqcd.F90 @@ -0,0 +1,534 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! bqcd.F90 - main program and read/write of parameters +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- + +! JuBE +! use kernel_a as a subroutine in the qcd-bench, this was the main function +! in the original code +subroutine kernel_a() + + use typedef_flags + use typedef_para + use module_input + use module_function_decl + implicit none + + type(type_para) :: para + type(hmc_conf), dimension(MAX_TEMPER) :: conf + type(type_flags) :: flags + SECONDS :: time0, sekunden + integer :: kernel_number + + kernel_number = 0 + +! JuBE +! call jube initial function + call jube_kernel_init(kernel_number) + +! JuBE +! set the flags%input to the inputfile name: bqcd-input + flags%input = "kernel_A.input" + + time0 = sekunden() ! start/initialize timer + + TIMING_START(timing_bin_total) + + call comm_init() + +! JuBE +! there is no need for the following call, we ignore all cmd line arguments, non +! of them but the input file name (set above) is relevant for the benchmark +! call get_flags(flags) + + call begin(UREC, "Job") + call input_read(flags%input) + call init_para(para, flags) + call init_counter(para, flags) + call init_ran(para, flags) + call init_cooling(input%measure_cooling_list) + + call set_fmt_ensemble(para%n_temper) + call check_fmt(para%run, para%n_temper, para%maxtraj, para%L(4) - 1) + + call init_common(para) + call init_modules() + + call write_header(para) + + call init_flip_bc() + call init_cg_para(para%cg_rest, para%cg_maxiter, para%cg_log) + call init_cg_stat() + call init_xbound() + call init_confs(para, conf) + + call check_former(para%n_temper, conf) + + +! JuBE +! call jube kernel run function + call jube_kernel_run() + + call mc(para, conf) + !!call xbound_test() + +! JuBE +! call jube kernel finalize function + call jube_kernel_finalize() + + + call conf_write(.true., para, conf) + + call write_counter(para%maxtraj) + call write_ran() + + TIMING_STOP(timing_bin_total) + + call write_footer(time0) + call end_A(UREC, "Job") + + call comm_finalize() + +! JuBE +! call jube kernel end function + call jube_kernel_end() + +end subroutine kernel_a + +!------------------------------------------------------------------------------- +subroutine init_para(para, flags) + + ! initialises module_para, module_switches and module_mre + + use typedef_flags + use typedef_para + use module_bqcd + use module_input + use module_mre + use module_switches + implicit none + + type(type_para) :: para + type(type_flags) :: flags + integer :: i + logical :: quenched, dynamical, clover, h_ext + + quenched = .false. + dynamical = .false. + clover = .false. + h_ext = .false. + + para%run = input%run + para%L = input%lattice + para%NPE = input%processes + para%bc_fermions = input%boundary_conditions_fermions + para%gamma_index = input%gamma_index + para%n_temper = input%ensembles + para%nstd = input%tempering_steps_without + para%nforce = input%hmc_accept_first + para%ntraj = input%mc_steps + para%maxtraj = input%mc_total_steps + para%nsave = input%mc_save_frequency + para%c_cg_rest = input%solver_rest + para%cg_maxiter = input%solver_maxiter + para%cg_log = input%solver_ignore_no_convergence + mre_n_vec = input%solver_mre_vectors + + call check_bc_fermions(para%bc_fermions, para%gamma_index) + + read(para%c_cg_rest, *) para%cg_rest + + if (para%n_temper <= 0) call die("init_para(): n_temper <= 0") + if (para%n_temper > MAX_TEMPER) call die("init_para(): n_temper > max_temper") + + do i = 1, para%n_temper + para%c_hmc(i)%beta = input%beta(i) + para%c_hmc(i)%kappa = input%kappa(i) + para%c_hmc(i)%csw = input%csw(i) + para%c_hmc(i)%h = input%h(i) + para%c_hmc(i)%traj_length = input%hmc_trajectory_length(i) + para%c_hmc(i)%ntau = input%hmc_steps(i) + para%c_hmc(i)%rho = input%hmc_rho(i) + para%c_hmc(i)%m_scale = input%hmc_m_scale(i) + para%info_file(i) = input%start_info_file(i) + + read(para%c_hmc(i)%beta, *) para%hmc(i)%beta + read(para%c_hmc(i)%kappa, *) para%hmc(i)%kappa + read(para%c_hmc(i)%csw, *) para%hmc(i)%csw + read(para%c_hmc(i)%h, *) para%hmc(i)%h + read(para%c_hmc(i)%traj_length,*) para%hmc(i)%traj_length + read(para%c_hmc(i)%ntau, *) para%hmc(i)%ntau + read(para%c_hmc(i)%rho, *) para%hmc(i)%rho + read(para%c_hmc(i)%m_scale, *) para%hmc(i)%m_scale + + if (para%hmc(i)%kappa == ZERO .and. para%hmc(i)%csw /= ZERO) then + para%hmc(i)%csw_kappa = para%hmc(i)%csw + para%c_hmc(i)%csw = "-1 (infinity)" + para%hmc(i)%csw = -1 + else + para%hmc(i)%csw_kappa = para%hmc(i)%csw * para%hmc(i)%kappa + call check_csw(para%hmc(i)%beta, para%hmc(i)%csw) + endif + + para%hmc(i)%tau = para%hmc(i)%traj_length / para%hmc(i)%ntau + + write(para%c_hmc(i)%csw_kappa, "(f20.15)") para%hmc(i)%csw_kappa + write(para%c_hmc(i)%tau, "(f20.15)") para%hmc(i)%tau + + if (para%hmc(i)%kappa == ZERO .and. para%hmc(i)%csw == ZERO) then + quenched = .true. + else + dynamical = .true. + endif + + if (para%hmc(i)%csw /= ZERO) clover = .true. + if (para%hmc(i)%h /= ZERO) h_ext = .true. + + para%hmc(i)%model = input%hmc_model + + if (para%hmc(i)%model == "A" .and. para%hmc(i)%rho /= ZERO) then + call warn("init_para(): model == A but rho /= 0") + endif + + if (para%hmc(i)%model /= "A" .and. para%hmc(i)%rho == ZERO) then + call warn("init_para(): model /= A but rho == 0") + endif + enddo + + select case (input%start_configuration) + case ("hot"); para%start = START_HOT + case ("cold"); para%start = START_COLD + case ("file"); para%start = START_FILE + case default + call die("init_para(): start_configuration must be {hot|cold|file}") + end select + + select case (input%start_random) + case ("random"); para%seed = -1 + case ("default"); para%seed = 0 + case default; read(input%start_random, *) para%seed + end select + + select case (input%tempering_swap_sequence) + case ("up"); para%swap_seq = SWAP_UP + case ("down"); para%swap_seq = SWAP_DOWN + case ("random"); para%swap_seq = SWAP_RANDOM + case default + call die("init_para(): tempering_swap_sequence must be {up|down|random}") + end select + + if (quenched .and. dynamical) call die("init_para(): quenched/dynamical mixed") + + if (para%nforce < 0) call die("init_para(): nforce < 0") + + if (flags%continuation_job) para%start = START_CONT + + + switches%quenched = quenched + switches%dynamical = dynamical + switches%clover = clover + switches%h_ext = h_ext + switches%hasenbusch = (input%hmc_model /= "A") + + if (quenched) switches%hasenbusch = .false. + + switches%tempering = .false. + switches%measure_polyakov_loop = .false. + switches%measure_traces = .false. + + if (input%ensembles > 1) switches%tempering = .true. + if (input%measure_polyakov_loop /= 0) switches%measure_polyakov_loop = .true. + if (input%measure_traces /= 0) switches%measure_traces = .true. + + if (input%hmc_test == 0) then + switches%hmc_test = .false. + else + switches%hmc_test = .true. + endif + +end subroutine init_para + +!------------------------------------------------------------------------------- +subroutine init_counter(para, flags) + + use typedef_flags + use typedef_para + use module_counter + use module_function_decl + implicit none + + type(type_para) :: para + type(type_flags) :: flags + FILENAME, external :: count_file, stop_file + + if (f_exist(stop_file())) then + call die("init_counter(): found stop file " // stop_file()) + endif + + counter%run = para%run + counter%j_traj = 0 + + if (flags%continuation_job) then + open(UCOUNT, file = count_file(), action = "read", status = "old") + read(UCOUNT, *) counter%run + read(UCOUNT, *) counter%job + read(UCOUNT, *) counter%traj + close(UCOUNT) + + if (counter%run /= para%run) call die("init_counter(): RUN inconsistent") + counter%job = counter%job + 1 + else + counter%run = para%run + counter%job = 1 + counter%traj = -para%nforce + endif + +end subroutine init_counter + +!------------------------------------------------------------------------------- +subroutine write_counter(maxtraj) + + use module_counter + use module_function_decl + implicit none + + integer :: maxtraj + FILENAME, external :: count_file, stop_file + + if (my_pe() /= 0) return + + open(UCOUNT, file = count_file(), action = "write") + write(UCOUNT, *) counter%run, " run" + write(UCOUNT, *) counter%job, " job" + write(UCOUNT, *) counter%traj, " traj" + close(UCOUNT) + + if (counter%traj >= maxtraj) then + open(UCOUNT, file = stop_file(), status = "unknown") + close(UCOUNT) + endif + +end subroutine write_counter + +!------------------------------------------------------------------------------- +subroutine write_header(para) + + use typedef_para + use module_bqcd + use module_counter + use module_function_decl + use module_input + use module_mre + use module_thread + implicit none + + type(type_para) :: para + integer :: i + character(len = 50) :: fmt + character(len = 4), external :: format_ensemble + + if (my_pe() == 0) then + + fmt = "(1x,a," // format_ensemble() // ",2a)" + + call begin(UREC, "Header") + + if (input%comment /= "") then + write(UREC, 405) "Comment", trim(input%comment) + endif + + write(UREC, 400) "Program", prog_name, prog_version + write(UREC, *) "Version_of_D ", version_of_d() + write(UREC, *) "Communication ", trim(comm_method()) + write(UREC, *) "Run ", para%run + write(UREC, *) "Job ", counter%job + write(UREC, 405) "Host", rechner() + write(UREC, 400) "Date", datum(), uhrzeit() + write(UREC, 410) "L ", para%L + write(UREC, 410) "NPE ", para%NPE + write(UREC, 410) "bc_fermions", para%bc_fermions + write(UREC, 410) "gamma_index", para%gamma_index + + + write(UREC, *) "Threads ", n_thread + write(UREC, *) "Start ", para%start + + if (para%start == START_FILE) then + do i = 1, para%n_temper + write(UREC, fmt) "StartConf_", i, " ", trim(para%info_file(i)) + enddo + endif + + write(UREC, *) "Seed ", para%seed + write(UREC, *) "Swap_seq", para%swap_seq + write(UREC, *) "N_force ", para%nforce + write(UREC, *) "N_traj ", para%ntraj + write(UREC, *) "N_save ", para%nsave + write(UREC, *) "N_temper", para%n_temper + + do i = 1, para%n_temper + write(UREC, fmt) "beta_", i, " ", trim(para%c_hmc(i)%beta) + write(UREC, fmt) "kappa_", i, " ", trim(para%c_hmc(i)%kappa) + write(UREC, fmt) "csw_", i, " ", trim(para%c_hmc(i)%csw) + write(UREC, fmt) "csw_kappa_", i, " ", trim(para%c_hmc(i)%csw_kappa) + write(UREC, fmt) "h_", i, " ", trim(para%c_hmc(i)%h) + write(UREC, fmt) "tau_", i, " ", trim(para%c_hmc(i)%tau) + write(UREC, fmt) "N_tau_", i, " ", trim(para%c_hmc(i)%ntau) + write(UREC, fmt) "traj_length_", i, " ", trim(para%c_hmc(i)%traj_length) + write(UREC, fmt) "rho_", i, " ", trim(para%c_hmc(i)%rho) + write(UREC, fmt) "m_scale_", i, " ", trim(para%c_hmc(i)%m_scale) + enddo + + write(UREC, *) "HMC_model ", para%hmc(1)%model + write(UREC, *) "REAL_kind ", RKIND + write(UREC, 405) "CG_rest ", trim(para%c_cg_rest) + write(UREC, *) "MRE_vectors ", mre_n_vec + + call end_A(UREC, "Header") + +400 format (3(1x,a)) +405 format (2(1x,a)) +410 format (1x,a,4i3) + + endif + +end subroutine write_header + +!------------------------------------------------------------------------------- +subroutine write_footer(time0) + + use module_function_decl + use module_thread + implicit none + + SEED :: seed + SECONDS :: time0, sekunden + + call ranget(seed) + + call begin(UREC, "Footer") + + if (my_pe() == 0) then + write(UREC, 400) "Date", datum(), uhrzeit() + write(UREC, *) "Seed", seed + write(UREC, 410) "CPU-Time", & + sekunden() - time0, "s on", num_pes() * n_thread, "CPUs" + endif + +400 format (3(1x,a)) +410 format (1x,a,1x,f8.1,1x,a,1x,i5,1x,a) + + TIMING_WRITE(UREC) + + call end_A(UREC, "Footer") + +end subroutine write_footer + +!------------------------------------------------------------------------------- +subroutine get_flags(flags) + + use typedef_cksum + use typedef_flags + use module_bqcd + use module_function_decl + use module_input + implicit none + + type(type_flags), intent(out) :: flags + + integer :: iarg, length, stat, narg + integer, external :: ipxfargc + character(len = 2) :: opt + + flags%continuation_job = .false. + flags%show_version = .false. + + narg = ipxfargc() + + iarg = 1 + do while (iarg <= narg) + call pxfgetarg(iarg, opt, length, stat) + + if (opt(1:1) == "-") then + if (length > 2) call usage() + + select case (opt(2:2)) + case ("c") + flags%continuation_job = .true. + iarg = iarg + 1 + case ("I") + call input_dump(6) + call comm_finalize() + stop + case ("V") + flags%show_version = .true. + iarg = iarg + 1 + case default + call usage + end select + else + exit + endif + enddo + + if (flags%show_version) then + call version() + call comm_finalize() + stop + endif + + call take_arg(iarg, flags%input, narg) + if (narg >= iarg) call usage + +CONTAINS + + subroutine usage() + implicit none + call die("Usage: " // prog_name // " [-c] [-I] [-V] input") + end subroutine usage + + subroutine version() + implicit none + + if (my_pe() == 0) then + write(6,*) "This is ", prog_name, " ", prog_version + write(6,*) " input format: ", input_version + write(6,*) " conf info format:", conf_info_version + write(6,*) " MAX_TEMPER: ", MAX_TEMPER + write(6,*) " real kind: ", RKIND + write(6,*) " version of D: ", version_of_d() + write(6,*) " D3: buffer vol: ", get_d3_buffer_vol() + write(6,*) " communication: ", trim(comm_method()) + endif + end subroutine version + + subroutine take_arg(iarg, arg, narg) + implicit none + integer, intent(inout) :: iarg + character(len = *), intent(out) :: arg + integer, intent(in) :: narg + integer :: length, stat + + if (iarg > narg) call usage + call pxfgetarg(iarg, arg, length, stat) + if (length > len(arg)) then + call die("get_flags(): " // arg // ": argument too long") + endif + iarg = iarg + 1 + end subroutine take_arg + +end subroutine get_flags + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/bqcd.pcl b/qcd/part_cpu/applications/QCD/src/kernel_A/bqcd.pcl new file mode 100644 index 0000000000000000000000000000000000000000..10905e9f70bd1b3bd9c17624efe4251d574c8400 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/bqcd.pcl @@ -0,0 +1,2 @@ +work.pc +modules/work.pc diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/cg.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/cg.F90 new file mode 100644 index 0000000000000000000000000000000000000000..8ab5d17ce98c328687c12ea6631da68d1e2e4837 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/cg.F90 @@ -0,0 +1,167 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! cg.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_cg + + type type_cg_para + real :: rest + integer :: maxiter + integer :: log + end type type_cg_para + + type(type_cg_para), save :: cg_para + + type type_cg_stat + integer :: niter + integer :: niter_max + integer :: niter_tot + integer :: ncall + end type type_cg_stat + + type(type_cg_stat), save :: cg_stat + + integer, save :: cg_iterations_total = 0 ! used in timing.F90 +end + +!------------------------------------------------------------------------------- +subroutine cg(matrix_mult, x, b, para, conf, iterations) + + ! solves "matrix_mult * x = b" and returns number of iterations + + use module_cg + use module_function_decl + use module_p_interface + use module_vol + use typedef_hmc + implicit none + + external :: matrix_mult + SPINCOL_OVERINDEXED, intent(out) :: x + SPINCOL_OVERINDEXED, intent(in) :: b + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(in) :: conf + integer, intent(out) :: iterations + + P_SPINCOL_OVERINDEXED, save :: r, p, aap + + REAL :: ak, bk, rtr, rtrold, paap + integer :: i, niter + character(72) :: msg + + TIMING_START(timing_bin_cg) + + ALLOCATE_SC_OVERINDEXED(r) + ALLOCATE_SC_OVERINDEXED(p) + ALLOCATE_SC_OVERINDEXED(aap) + + call matrix_mult(r, x, para, conf) + + rtrold = ZERO + !$omp parallel do reduction(+: rtrold) + do i = 1, size_sc_field + r(i) = b(i) - r(i) + p(i) = r(i) + rtrold = rtrold + r(i)**2 + enddo + + rtrold = global_sum(rtrold) + + do niter = 1, cg_para%maxiter + call matrix_mult(aap, p, para, conf) + + paap = sc_dot(p, aap) + paap = global_sum(paap) + + ak = rtrold / paap + + rtr = ZERO + !$omp parallel do reduction(+: rtr) + do i = 1, size_sc_field + x(i) = x(i) + ak * p(i) + r(i) = r(i) - ak * aap(i) + rtr = rtr + r(i)**2 + enddo + + rtr = global_sum(rtr) + + if (rtr <= cg_para%rest) goto 9999 + + bk = rtr / rtrold + rtrold = rtr + + call sc_xpby(p, r, bk) ! p = r + bk * p + enddo + + niter = niter - 1 + + if (cg_para%log /= 2) then + write(msg, *) "cg(): no convergence; rtr = ", rtr + call die(msg) + endif + +9999 continue + + cg_stat%ncall = cg_stat%ncall + 1 + cg_stat%niter = niter + cg_stat%niter_max = max(cg_stat%niter_max, niter) + cg_stat%niter_tot = cg_stat%niter_tot + niter + cg_iterations_total = cg_iterations_total + niter + + iterations = niter + + TIMING_STOP(timing_bin_cg) +end + +!------------------------------------------------------------------------------- +subroutine init_cg_para(rest, maxiter, log) + + use module_cg + implicit none + real rest + integer maxiter, log + + cg_para%rest = rest + cg_para%maxiter = maxiter + cg_para%log = log + +end + +!------------------------------------------------------------------------------- +subroutine init_cg_stat() + + use module_cg + implicit none + + cg_stat%ncall = 0 + cg_stat%niter_max = 0 + cg_stat%niter_tot = 0 + +end + +!------------------------------------------------------------------------------- +subroutine get_cg_stat(ncall, niter_max, niter_tot) + + use module_cg + implicit none + integer ncall, niter_max, niter_tot + + ncall = cg_stat%ncall + niter_max = cg_stat%niter_max + niter_tot = cg_stat%niter_tot + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/checks.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/checks.F90 new file mode 100644 index 0000000000000000000000000000000000000000..8c8cd815dd6e1089ba95ef3aa3d80833755bc086 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/checks.F90 @@ -0,0 +1,60 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! checks.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine check_csw(beta, csw) + + implicit none + REAL, intent(in) :: beta, csw + REAL :: g, c + + if (beta == ZERO) return + if (csw == ZERO) return + + g = SIX / beta + c = ONE - 0.454 * g - 0.175 * g**2 + 0.012 * g**3 + 0.045 * g**4 + c = c / (ONE - 0.720 * g) + + if (abs(c - csw) > 0.00005) then + call warn("check_csw(): c_sw differs more than 0.00005 from ALPHA value") + endif + +end + +!------------------------------------------------------------------------------- +subroutine check_bc_fermions(bc_fermions, gamma_index) + + ! warns if the number of anti-periodic fermionic b.c. is 1 and + ! the anti-periodic direction is not the gamma_4 direction + + implicit none + integer, dimension(DIM), intent(in) :: bc_fermions, gamma_index + + integer :: i, i_anti, count + + count = 0 + do i = 1, DIM + if (bc_fermions(i) < 0) then + count = count + 1 + i_anti = i + endif + enddo + + if (count == 1 .and. gamma_index(i_anti) /= 4) then + call warn("check_bc_fermions(): anti-periodic b.c. not in gamma_4 direction") + endif + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/cksum.c b/qcd/part_cpu/applications/QCD/src/kernel_A/cksum.c new file mode 100644 index 0000000000000000000000000000000000000000..0b3c77470db5537b5b750e9a784a0474568b16ea --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/cksum.c @@ -0,0 +1,154 @@ +/* +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! Adopted from: +! +! ================== +! QCD SF/T3E PROGRAM +! ================== +! +! Calculate a modified cyclic redundancy check (CRC is specified by the +! POSIX.2 standard). Modification is necessary since I do not know how +! to handle a unsigned long in FORTRAN. Solution: CKSUM returns negative +! number if result is > LONG_MAX. +! +! CKSUM_GET() returns always positive numbers < LONG_MAX. (H.S.) +! +! +! Parts of the source come from cksum.c of the GNU text utilities (version +! 1.19) written by Q. Frank Xia. +! +! $Log: cksum.c,v $ +! Revision 1.1 2007/11/14 13:10:15 mallalen +! *** empty log message *** +! +! Revision 1.1 1997/12/04 10:31:09 pleiter +! Initial writing attempt +! +!----------------------------------------------------------------------------- +*/ + +#ifdef NamesToLower_ +# define CKSUM_INIT cksum_init_ +# define CKSUM_ADD cksum_add_ +# define CKSUM_GET cksum_get_ +#endif + +#ifdef NamesToLower +# define CKSUM_INIT cksum_init +# define CKSUM_ADD cksum_add +# define CKSUM_GET cksum_get +#endif + +#ifdef LongLong +# define INT8 long long +#else +# define INT8 long +#endif + +void CKSUM_INIT(void); +void CKSUM_ADD(void *, INT8 *); +void CKSUM_GET(INT8 *, INT8 *); + +static unsigned INT8 the_crc = 0; +static INT8 the_bytes = 0; + +static unsigned INT8 const crctab[256] = +{ + 0x0, + 0x04C11DB7, 0x09823B6E, 0x0D4326D9, 0x130476DC, 0x17C56B6B, + 0x1A864DB2, 0x1E475005, 0x2608EDB8, 0x22C9F00F, 0x2F8AD6D6, + 0x2B4BCB61, 0x350C9B64, 0x31CD86D3, 0x3C8EA00A, 0x384FBDBD, + 0x4C11DB70, 0x48D0C6C7, 0x4593E01E, 0x4152FDA9, 0x5F15ADAC, + 0x5BD4B01B, 0x569796C2, 0x52568B75, 0x6A1936C8, 0x6ED82B7F, + 0x639B0DA6, 0x675A1011, 0x791D4014, 0x7DDC5DA3, 0x709F7B7A, + 0x745E66CD, 0x9823B6E0, 0x9CE2AB57, 0x91A18D8E, 0x95609039, + 0x8B27C03C, 0x8FE6DD8B, 0x82A5FB52, 0x8664E6E5, 0xBE2B5B58, + 0xBAEA46EF, 0xB7A96036, 0xB3687D81, 0xAD2F2D84, 0xA9EE3033, + 0xA4AD16EA, 0xA06C0B5D, 0xD4326D90, 0xD0F37027, 0xDDB056FE, + 0xD9714B49, 0xC7361B4C, 0xC3F706FB, 0xCEB42022, 0xCA753D95, + 0xF23A8028, 0xF6FB9D9F, 0xFBB8BB46, 0xFF79A6F1, 0xE13EF6F4, + 0xE5FFEB43, 0xE8BCCD9A, 0xEC7DD02D, 0x34867077, 0x30476DC0, + 0x3D044B19, 0x39C556AE, 0x278206AB, 0x23431B1C, 0x2E003DC5, + 0x2AC12072, 0x128E9DCF, 0x164F8078, 0x1B0CA6A1, 0x1FCDBB16, + 0x018AEB13, 0x054BF6A4, 0x0808D07D, 0x0CC9CDCA, 0x7897AB07, + 0x7C56B6B0, 0x71159069, 0x75D48DDE, 0x6B93DDDB, 0x6F52C06C, + 0x6211E6B5, 0x66D0FB02, 0x5E9F46BF, 0x5A5E5B08, 0x571D7DD1, + 0x53DC6066, 0x4D9B3063, 0x495A2DD4, 0x44190B0D, 0x40D816BA, + 0xACA5C697, 0xA864DB20, 0xA527FDF9, 0xA1E6E04E, 0xBFA1B04B, + 0xBB60ADFC, 0xB6238B25, 0xB2E29692, 0x8AAD2B2F, 0x8E6C3698, + 0x832F1041, 0x87EE0DF6, 0x99A95DF3, 0x9D684044, 0x902B669D, + 0x94EA7B2A, 0xE0B41DE7, 0xE4750050, 0xE9362689, 0xEDF73B3E, + 0xF3B06B3B, 0xF771768C, 0xFA325055, 0xFEF34DE2, 0xC6BCF05F, + 0xC27DEDE8, 0xCF3ECB31, 0xCBFFD686, 0xD5B88683, 0xD1799B34, + 0xDC3ABDED, 0xD8FBA05A, 0x690CE0EE, 0x6DCDFD59, 0x608EDB80, + 0x644FC637, 0x7A089632, 0x7EC98B85, 0x738AAD5C, 0x774BB0EB, + 0x4F040D56, 0x4BC510E1, 0x46863638, 0x42472B8F, 0x5C007B8A, + 0x58C1663D, 0x558240E4, 0x51435D53, 0x251D3B9E, 0x21DC2629, + 0x2C9F00F0, 0x285E1D47, 0x36194D42, 0x32D850F5, 0x3F9B762C, + 0x3B5A6B9B, 0x0315D626, 0x07D4CB91, 0x0A97ED48, 0x0E56F0FF, + 0x1011A0FA, 0x14D0BD4D, 0x19939B94, 0x1D528623, 0xF12F560E, + 0xF5EE4BB9, 0xF8AD6D60, 0xFC6C70D7, 0xE22B20D2, 0xE6EA3D65, + 0xEBA91BBC, 0xEF68060B, 0xD727BBB6, 0xD3E6A601, 0xDEA580D8, + 0xDA649D6F, 0xC423CD6A, 0xC0E2D0DD, 0xCDA1F604, 0xC960EBB3, + 0xBD3E8D7E, 0xB9FF90C9, 0xB4BCB610, 0xB07DABA7, 0xAE3AFBA2, + 0xAAFBE615, 0xA7B8C0CC, 0xA379DD7B, 0x9B3660C6, 0x9FF77D71, + 0x92B45BA8, 0x9675461F, 0x8832161A, 0x8CF30BAD, 0x81B02D74, + 0x857130C3, 0x5D8A9099, 0x594B8D2E, 0x5408ABF7, 0x50C9B640, + 0x4E8EE645, 0x4A4FFBF2, 0x470CDD2B, 0x43CDC09C, 0x7B827D21, + 0x7F436096, 0x7200464F, 0x76C15BF8, 0x68860BFD, 0x6C47164A, + 0x61043093, 0x65C52D24, 0x119B4BE9, 0x155A565E, 0x18197087, + 0x1CD86D30, 0x029F3D35, 0x065E2082, 0x0B1D065B, 0x0FDC1BEC, + 0x3793A651, 0x3352BBE6, 0x3E119D3F, 0x3AD08088, 0x2497D08D, + 0x2056CD3A, 0x2D15EBE3, 0x29D4F654, 0xC5A92679, 0xC1683BCE, + 0xCC2B1D17, 0xC8EA00A0, 0xD6AD50A5, 0xD26C4D12, 0xDF2F6BCB, + 0xDBEE767C, 0xE3A1CBC1, 0xE760D676, 0xEA23F0AF, 0xEEE2ED18, + 0xF0A5BD1D, 0xF464A0AA, 0xF9278673, 0xFDE69BC4, 0x89B8FD09, + 0x8D79E0BE, 0x803AC667, 0x84FBDBD0, 0x9ABC8BD5, 0x9E7D9662, + 0x933EB0BB, 0x97FFAD0C, 0xAFB010B1, 0xAB710D06, 0xA6322BDF, + 0xA2F33668, 0xBCB4666D, 0xB8757BDA, 0xB5365D03, 0xB1F740B4 +}; + +void CKSUM_INIT(void) +{ + the_crc = 0; + the_bytes = 0; +} + +void CKSUM_ADD(void *memptr, INT8 *nbytes) +{ + register unsigned INT8 crc; + register INT8 i; + register unsigned char *cp; + + crc = the_crc; + cp = (unsigned char *) memptr; + for (i=0; i < *nbytes; i++) + crc = (crc << 8) ^ crctab[((crc >> 24) ^ *(cp++)) & 0xFF]; + + the_crc = crc; + the_bytes += *nbytes; +} + +void CKSUM_GET(INT8 *total_crc, INT8 *total_bytes) +{ + register unsigned INT8 crc; + register INT8 i; + + crc = the_crc; + + for (i = the_bytes; i > 0; i >>= 8) + crc = (crc << 8) ^ crctab[((crc >> 24) ^ i) & 0xFF]; + crc = (~crc & 0xFFFFFFFF); + + *total_crc = (INT8) crc; + *total_bytes = the_bytes; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/cksum_dummy.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/cksum_dummy.F90 new file mode 100644 index 0000000000000000000000000000000000000000..97fb8fda1b92d63dae060068a61611037a64fb39 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/cksum_dummy.F90 @@ -0,0 +1,35 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! cksum_dummy.F90 - dummy routines that can replace the real C routines +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine cksum_init() + return +end + +!------------------------------------------------------------------------------- +subroutine cksum_add(i, j) + integer i(*) + CHECK_SUM j + return +end + +!------------------------------------------------------------------------------- +subroutine cksum_get(sum, bytes) + CHECK_SUM sum, bytes + sum = 0 + bytes = 0 +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..445b413cf94079e033efb4c609177f59e8c69ea8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/Makefile @@ -0,0 +1,67 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +# +#------------------------------------------------------------------------------- +# +# clover/Makefile +# +#=============================================================================== + +include ../Makefile.defs + +fpp = $(FPP) -I.. $(FPPFLAGS) + +MODULES_DIR = ../modules + +.SUFFIXES: +.SUFFIXES: .a .o .F90 + +.F90.o: + $(fpp) $< > $*.f90 + $(F90) -c $(FFLAGS) -I$(MODULES_DIR) $*.f90 + +OBJS = \ + clover_action.o \ + clover_allocate.o \ + clover_bsa.o \ + clover_d.o \ + clover_f_mu_nu.o \ + clover_init.o \ + clover_inv.o \ + clover_mult_a.o \ + clover_mult_ao.o \ + clover_mult_b.o \ + clover_t_init.o \ + clover_ts.o \ + clover_uuu.o \ + clover_uuuu.o + +OBJS_CTEST = \ + ctest.o \ + clover_inv.o \ + clover_mult_a.o \ + clover_mult_ao.o \ + clover_mult_b.o + +$(LIBCLOVER): + +libclover.a: $(OBJS) + $(AR) $(ARFLAGS) $@ $(OBJS) + $(RANLIB) $@ + +fast: + $(FAST_MAKE) + +ctest: $(OBJS_CTEST) + f90 -o $@ $(OBJS_CTEST) + +clean: + rm -f *.[Tiod] *.f90 *.mod core work.pc work.pcl + +clobber: clean + rm -f libclover.a ctest diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/bqcd.pcl b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/bqcd.pcl new file mode 100644 index 0000000000000000000000000000000000000000..906244500b31700684482c3dcfd32f6cec4279db --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/bqcd.pcl @@ -0,0 +1,2 @@ +work.pc +../modules/work.pc diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover.h b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover.h new file mode 100644 index 0000000000000000000000000000000000000000..9f22a2453f2aaceb37ffb33d9d8f03b4e08f0ac5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover.h @@ -0,0 +1,166 @@ +#ifdef CLOVER_AS_COMPLEX_ARRAY + +# define A11 Re(a(1,J,i)) +# define A22 Im(a(1,J,i)) +# define A33 Re(a(11,J,i)) +# define A44 Im(a(11,J,i)) +# define A55 Re(a(17,J,i)) +# define A66 Im(a(17,J,i)) + +# define A12 a(2,J,i) +# define A13 a(3,J,i) +# define A14 a(4,J,i) +# define A15 a(5,J,i) +# define A16 a(6,J,i) + +# define A23 a(7,J,i) +# define A24 a(8,J,i) +# define A25 a(9,J,i) +# define A26 a(10,J,i) + +# define A34 a(12,J,i) +# define A35 a(13,J,i) +# define A36 a(14,J,i) + +# define A45 a(15,J,i) +# define A46 a(16,J,i) + +# define A56 a(18,J,i) + +# define B11 Re(b(16,J,i)) +# define B22 Im(b(16,J,i)) +# define B33 Re(b(17,J,i)) +# define B44 Im(b(17,J,i)) +# define B55 Re(b(18,J,i)) +# define B66 Im(b(18,J,i)) + +# define B21 b(1,J,i) + +# define B31 b(2,J,i) +# define B32 b(3,J,i) + +# define B41 b(4,J,i) +# define B42 b(5,J,i) +# define B43 b(6,J,i) + +# define B51 b(7,J,i) +# define B52 b(8,J,i) +# define B53 b(9,J,i) +# define B54 b(10,J,i) + +# define B61 b(11,J,i) +# define B62 b(12,J,i) +# define B63 b(13,J,i) +# define B64 b(14,J,i) +# define B65 b(15,J,i) + +#else + +# define A11 a%i11 +# define A22 a%i22 +# define A33 a%i33 +# define A44 a%i44 +# define A55 a%i55 +# define A66 a%i66 + +# define A12 a%i12 +# define A13 a%i13 +# define A14 a%i14 +# define A15 a%i15 +# define A16 a%i16 + +# define A23 a%i23 +# define A24 a%i24 +# define A25 a%i25 +# define A26 a%i26 + +# define A34 a%i34 +# define A35 a%i35 +# define A36 a%i36 + +# define A45 a%i45 +# define A46 a%i46 + +# define A56 a%i56 + +# define B11 b%i11 +# define B22 b%i22 +# define B33 b%i33 +# define B44 b%i44 +# define B55 b%i55 +# define B66 b%i66 + +# define B21 b%i21 + +# define B31 b%i31 +# define B32 b%i32 + +# define B41 b%i41 +# define B42 b%i42 +# define B43 b%i43 + +# define B51 b%i51 +# define B52 b%i52 +# define B53 b%i53 +# define B54 b%i54 + +# define B61 b%i61 +# define B62 b%i62 +# define B63 b%i63 +# define B64 b%i64 +# define B65 b%i65 + +#endif + +# define A21 conjg(A12) +# define A31 conjg(A13) +# define A41 conjg(A14) +# define A51 conjg(A15) +# define A61 conjg(A16) + +# define A32 conjg(A23) +# define A42 conjg(A24) +# define A52 conjg(A25) +# define A62 conjg(A26) + +# define A43 conjg(A34) +# define A53 conjg(A35) +# define A63 conjg(A36) + +# define A54 conjg(A45) +# define A64 conjg(A46) + +# define A65 conjg(A56) + +# define B12 conjg(B21) + +# define B13 conjg(B31) +# define B23 conjg(B32) + +# define B14 conjg(B41) +# define B24 conjg(B42) +# define B34 conjg(B43) + +# define B15 conjg(B51) +# define B25 conjg(B52) +# define B35 conjg(B53) +# define B45 conjg(B54) + +# define B16 conjg(B61) +# define B26 conjg(B62) +# define B36 conjg(B63) +# define B46 conjg(B64) +# define B56 conjg(B65) + +# define SC1 1, 1 +# define SC2 1, 2 +# define SC3 1, 3 +# define SC4 2, 1 +# define SC5 2, 2 +# define SC6 2, 3 +# define SC7 3, 1 +# define SC8 3, 2 +# define SC9 3, 3 +# define SC10 4, 1 +# define SC11 4, 2 +# define SC12 4, 3 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_action.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_action.F90 new file mode 100644 index 0000000000000000000000000000000000000000..a9fbd0450cbba703fd61728ca1a91fd003fead18 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_action.F90 @@ -0,0 +1,51 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! clover_action.F90 - calculates: -2 Tr(log(T_oo)) +! +!------------------------------------------------------------------------------- +# include "defs.h" +# include "clover.h" + +!------------------------------------------------------------------------------- +REAL function clover_action(b) + + use typedef_clover + use module_vol + implicit none + + type(type_clover_b) :: b(2, volh) + integer :: i + REAL :: s, global_sum + + + s = ZERO + + !$omp parallel do reduction(+: s) + do i = 1, volh + s = s + log(det(b(1, i)) * det(b(2, i))) + enddo + + clover_action = TWO * global_sum(s) + + +CONTAINS + + REAL function det(b) ! returns (1 / det) + + type(type_clover_b) :: b + + det = B11 * B22 * B33 * B44 * B55 * B66 + + end function det + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_allocate.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_allocate.F90 new file mode 100644 index 0000000000000000000000000000000000000000..60475de54ce8caf5937248dac933d897ec09de9a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_allocate.F90 @@ -0,0 +1,40 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! clover_allocate.F90 - allocation of clover arrays +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine allocate_clover_field_a(a) + + use typedef_clover + use module_vol + implicit none + P_CLOVER_FIELD_A :: a + + allocate(a(2, volh, EVEN:ODD)) + +end + +!------------------------------------------------------------------------------- +subroutine allocate_clover_field_b(b) + + use typedef_clover + use module_vol + implicit none + P_CLOVER_FIELD_B :: b + + allocate(b(2, volh, EVEN:ODD)) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa.F90 new file mode 100644 index 0000000000000000000000000000000000000000..499abb10c889fd6d8c6b673a8fdd676a0e6a5f6c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa.F90 @@ -0,0 +1,165 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! clover_bsa.F90 - calculates "B sigma A" +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine clover_bsa(mu, nu, w, b, a) ! w = transposed(conjg(b) sigma_mu_nu a) + + use module_vol + implicit none + + integer :: mu, nu + SU3_FIELD :: w + SPINCOL_FIELD :: b, a + + if (mu == 1) then + if (nu == 2) then ; call clover_bsa_12(w, b, a) + elseif (nu == 3) then ; call clover_bsa_13(w, b, a) + elseif (nu == 4) then ; call clover_bsa_14(w, b, a) ; endif + elseif (mu == 2) then + if (nu == 1) then ; call clover_bsa_21(w, b, a) + elseif (nu == 3) then ; call clover_bsa_23(w, b, a) + elseif (nu == 4) then ; call clover_bsa_24(w, b, a) ; endif + elseif (mu == 3) then + if (nu == 1) then ; call clover_bsa_31(w, b, a) + elseif (nu == 2) then ; call clover_bsa_32(w, b, a) + elseif (nu == 4) then ; call clover_bsa_34(w, b, a) ; endif + elseif (mu == 4) then + if (nu == 1) then ; call clover_bsa_41(w, b, a) + elseif (nu == 2) then ; call clover_bsa_42(w, b, a) + elseif (nu == 3) then ; call clover_bsa_43(w, b, a) ; endif + endif +end + +!------------------------------------------------------------------------------- +subroutine clover_bsa_12(w, b, a) + +# include "clover_bsa_head.h90" + a1 = -a(1, ca, i) + a2 = a(2, ca, i) + a3 = -a(3, ca, i) + a4 = a(4, ca, i) +# include "clover_bsa_tail.h90" + +!------------------------------------------------------------------------------- +subroutine clover_bsa_21(w, b, a) + +# include "clover_bsa_head.h90" + a1 = a(1, ca, i) + a2 = -a(2, ca, i) + a3 = a(3, ca, i) + a4 = -a(4, ca, i) +# include "clover_bsa_tail.h90" + +!------------------------------------------------------------------------------- +subroutine clover_bsa_13(w, b, a) + +# include "clover_bsa_head.h90" + a1 = -i_times(a(2, ca, i)) + a2 = i_times(a(1, ca, i)) + a3 = -i_times(a(4, ca, i)) + a4 = i_times(a(3, ca, i)) +# include "clover_bsa_tail.h90" + +!------------------------------------------------------------------------------- +subroutine clover_bsa_31(w, b, a) + +# include "clover_bsa_head.h90" + a1 = i_times(a(2, ca, i)) + a2 = -i_times(a(1, ca, i)) + a3 = i_times(a(4, ca, i)) + a4 = -i_times(a(3, ca, i)) +# include "clover_bsa_tail.h90" + +!------------------------------------------------------------------------------- +subroutine clover_bsa_14(w, b, a) + +# include "clover_bsa_head.h90" + a1 = a(4, ca, i) + a2 = a(3, ca, i) + a3 = a(2, ca, i) + a4 = a(1, ca, i) +# include "clover_bsa_tail.h90" + +!------------------------------------------------------------------------------- +subroutine clover_bsa_41(w, b, a) + +# include "clover_bsa_head.h90" + a1 = -a(4, ca, i) + a2 = -a(3, ca, i) + a3 = -a(2, ca, i) + a4 = -a(1, ca, i) +# include "clover_bsa_tail.h90" + +!------------------------------------------------------------------------------- +subroutine clover_bsa_23(w, b, a) + +# include "clover_bsa_head.h90" + a1 = -a(2, ca, i) + a2 = -a(1, ca, i) + a3 = -a(4, ca, i) + a4 = -a(3, ca, i) +# include "clover_bsa_tail.h90" + +!------------------------------------------------------------------------------- +subroutine clover_bsa_32(w, b, a) + +# include "clover_bsa_head.h90" + a1 = a(2, ca, i) + a2 = a(1, ca, i) + a3 = a(4, ca, i) + a4 = a(3, ca, i) +# include "clover_bsa_tail.h90" + +!------------------------------------------------------------------------------- +subroutine clover_bsa_24(w, b, a) + +# include "clover_bsa_head.h90" + a1 = -i_times(a(4, ca, i)) + a2 = i_times(a(3, ca, i)) + a3 = -i_times(a(2, ca, i)) + a4 = i_times(a(1, ca, i)) +# include "clover_bsa_tail.h90" + +!------------------------------------------------------------------------------- +subroutine clover_bsa_42(w, b, a) + +# include "clover_bsa_head.h90" + a1 = i_times(a(4, ca, i)) + a2 = -i_times(a(3, ca, i)) + a3 = i_times(a(2, ca, i)) + a4 = -i_times(a(1, ca, i)) +# include "clover_bsa_tail.h90" + +!------------------------------------------------------------------------------- +subroutine clover_bsa_34(w, b, a) + +# include "clover_bsa_head.h90" + a1 = a(3, ca, i) + a2 = -a(4, ca, i) + a3 = a(1, ca, i) + a4 = -a(2, ca, i) +# include "clover_bsa_tail.h90" + +!------------------------------------------------------------------------------- +subroutine clover_bsa_43(w, b, a) + +# include "clover_bsa_head.h90" + a1 = -a(3, ca, i) + a2 = a(4, ca, i) + a3 = -a(1, ca, i) + a4 = a(2, ca, i) +# include "clover_bsa_tail.h90" + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa_head.h90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa_head.h90 new file mode 100644 index 0000000000000000000000000000000000000000..b2dae802a86eb8c4376fa6ac835510fd28849878 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa_head.h90 @@ -0,0 +1,16 @@ + use module_vol + implicit none + + SU3_FIELD :: w + SPINCOL_FIELD :: b, a + COMPLEX :: a1, a2, a3, a4 + integer :: i, ca, cb + + ! statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + !$omp parallel do private(ca, cb, a1, a2, a3, a4) + do i = 1, volh + do ca = 1, NCOL diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa_tail.h90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa_tail.h90 new file mode 100644 index 0000000000000000000000000000000000000000..66092341dae7a1266365740e4405af306c5c8abc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa_tail.h90 @@ -0,0 +1,10 @@ + do cb = 1, NCOL + w(ca, cb, i) = a1 * conjg(b(1, cb, i)) & + + a2 * conjg(b(2, cb, i)) & + + a3 * conjg(b(3, cb, i)) & + + a4 * conjg(b(4, cb, i)) + enddo + enddo + enddo + +end diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_d.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_d.F90 new file mode 100644 index 0000000000000000000000000000000000000000..71611a1c43b24aebdaa31bc1668f3b33ecaf1587 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_d.F90 @@ -0,0 +1,255 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! clover_d.F90 - derivative of clover term +! +!------------------------------------------------------------------------------- +! +! E -- 2 -- B +! | | A = x +! 3 1 ^ nu B = x + mu^ + nu^ +! | | | C = x + mu^ - nu^ +! A -- 0 -- D x --> mu +! | | D = x + mu^ +! 4 6 E = x + nu^ +! | | F = x - nu^ +! F -- 5 -- C +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine clover_dsd(eo, p, b, s, u) + + use typedef_clover + use module_p_interface + use module_vol + implicit none + + integer :: eo ! EVEN/ODD property of "b" + GENERATOR_FIELD :: p + CLOVER_FIELD_B :: b + REAL :: s + GAUGE_FIELD :: u + + P_GAUGE_FIELD, save :: w ! use existing data structure + + CLOVER_FIELD_C :: t +!dir$ cache_align t + + + TIMING_START(timing_bin_clover_dsd) + + ALLOCATE_G_FIELD(w) + + call clover_t_init(t, b(1, 1, eo)) + + call clover_ts(1, 2, w(1, 1, 1, EVEN, 1), t) ; call xbound_g(w, EVEN, 1) + call clover_ts(1, 3, w(1, 1, 1, ODD, 1), t) ; call xbound_g(w, ODD, 1) + call clover_ts(1, 4, w(1, 1, 1, EVEN, 2), t) ; call xbound_g(w, EVEN, 2) + call clover_ts(2, 3, w(1, 1, 1, ODD, 2), t) ; call xbound_g(w, ODD, 2) + call clover_ts(2, 4, w(1, 1, 1, EVEN, 3), t) ; call xbound_g(w, EVEN, 3) + call clover_ts(3, 4, w(1, 1, 1, ODD, 3), t) ; call xbound_g(w, ODD, 3) + + call clover_d(eo, p, s, u, w) + + TIMING_STOP(timing_bin_clover_dsd) +end + +!------------------------------------------------------------------------------- +subroutine clover_dsf(eo, p, b, a, s, u) + + use module_p_interface + use module_vol + implicit none + + integer :: eo ! EVEN/ODD property of "b" and "a" + GENERATOR_FIELD :: p + SPINCOL_FIELD :: b, a + REAL :: s + GAUGE_FIELD :: u + + P_GAUGE_FIELD, save :: w ! use existing data structure + + + TIMING_START(timing_bin_clover_dsf) + + ALLOCATE_G_FIELD(w) + + call clover_bsa(1, 2, w(1, 1, 1, EVEN, 1), b, a) ; call xbound_g(w, EVEN, 1) + call clover_bsa(1, 3, w(1, 1, 1, ODD, 1), b, a) ; call xbound_g(w, ODD, 1) + call clover_bsa(1, 4, w(1, 1, 1, EVEN, 2), b, a) ; call xbound_g(w, EVEN, 2) + call clover_bsa(2, 3, w(1, 1, 1, ODD, 2), b, a) ; call xbound_g(w, ODD, 2) + call clover_bsa(2, 4, w(1, 1, 1, EVEN, 3), b, a) ; call xbound_g(w, EVEN, 3) + call clover_bsa(3, 4, w(1, 1, 1, ODD, 3), b, a) ; call xbound_g(w, ODD, 3) + + call clover_d(eo, p, s, u, w) + + TIMING_STOP(timing_bin_clover_dsf) +end + +!------------------------------------------------------------------------------- +subroutine clover_d(eo, p, s, u, w) + + use module_vol + implicit none + + integer :: eo + GENERATOR_FIELD :: p + REAL :: s + GAUGE_FIELD :: u, w + + call clover_d_mu_nu(eo, 1, 2, p, s, u, w(1, 1, 1, EVEN, 1)) + call clover_d_mu_nu(eo, 1, 3, p, s, u, w(1, 1, 1, ODD, 1)) + call clover_d_mu_nu(eo, 1, 4, p, s, u, w(1, 1, 1, EVEN, 2)) + call clover_d_mu_nu(eo, 2, 3, p, s, u, w(1, 1, 1, ODD, 2)) + call clover_d_mu_nu(eo, 2, 4, p, s, u, w(1, 1, 1, EVEN, 3)) + call clover_d_mu_nu(eo, 3, 4, p, s, u, w(1, 1, 1, ODD, 3)) + +end + +!------------------------------------------------------------------------------- +subroutine clover_d_mu_nu(e, mu, nu, p, s, u, w) + + use module_vol + implicit none + + integer :: e, o, mu, nu + GENERATOR_FIELD :: p + REAL :: s + GAUGE_FIELD :: u + SU3_FIELD :: w + external clover_d_same_eo, clover_d_diff_eo + + o = EVEN + ODD - e + + call clover_d_loop(e, mu, nu, p, s, u, w, clover_d_same_eo) + call clover_d_loop(e, nu, mu, p, -s, u, w, clover_d_same_eo) + call clover_d_loop(o, mu, nu, p, s, u, w, clover_d_diff_eo) + call clover_d_loop(o, nu, mu, p, -s, u, w, clover_d_diff_eo) + +end + +!------------------------------------------------------------------------------- +subroutine clover_d_loop(e, mu, nu, p, s, u, w, clover_dd) + + use module_vol + use module_nn + implicit none + + integer :: e, mu, nu + GENERATOR_FIELD :: p + REAL :: s + GAUGE_FIELD :: u + SU3_FIELD :: w + external clover_dd + + integer :: o, i, ia, ib, ic, id, ie, if, j + GENERATOR :: q + + o = EVEN + ODD - e + + !$omp parallel do private(ia, ib, ic, id, ie, if, j, q) + do i = 1, volh + + id = nn(i, e, mu, FWD) + ie = nn(i, e, nu, FWD) + if = nn(i, e, nu, BWD) + + ia = i + ib = nn(id, o, nu, FWD) + ic = nn(id, o, nu, BWD) + + call clover_dd(q, s, u(1, 1, ia, e, mu), & + u(1, 1, id, o, nu), & + u(1, 1, ie, o, mu), & + u(1, 1, ia, e, nu), & + u(1, 1, if, o, nu), & + u(1, 1, if, o, mu), & + u(1, 1, ic, e, nu), & + w(1, 1, ia), & + w(1, 1, ib), & + w(1, 1, ic), & + w(1, 1, id), & + w(1, 1, ie), & + w(1, 1, if)) + + do j = 1, NGEN + p(j, i, e, mu) = p(j, i, e, mu) + q(j) + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine clover_d_same_eo(p, s, u0,u1,u2,u3,u4,u5,u6, wa,wb,wc, wd,we,wf) + + implicit none + + GENERATOR :: p + REAL :: s + SU3 :: u0, u1, u2, u3, u4, u5, u6, wa, wb, wc, wd, we, wf + SU3 :: r, u, v + + p = ZERO + + call uu(u, u0, u1) + call uu(v, u3, u2) + + call clover_uuu_udu(r, u, v, wa) ; call re_tr_j(p, r, s) + call clover_uuu_uud(r, wa, v, u) ; call re_tr_j(p, r, s) + call clover_uuu_uud(r, u, wb, v) ; call re_tr_j(p, r, s) + call clover_uuu_uud(r, v, wb, u) ; call re_tr_j(p, r, s) + + call uud(u, u0, u6) + call udu(v, u4, u5) + + call clover_uuu_uud(r, wa, v, u) ; call re_tr_j(p, r, -s) + call clover_uuu_udu(r, u, v, wa) ; call re_tr_j(p, r, -s) + call clover_uuu_uud(r, v, wc, u) ; call re_tr_j(p, r, -s) + call clover_uuu_uud(r, u, wc, v) ; call re_tr_j(p, r, -s) + +end + +!------------------------------------------------------------------------------- +subroutine clover_d_diff_eo(p, s, u0,u1,u2,u3,u4,u5,u6, wa,wb,wc, wd,we,wf) + + implicit none + + GENERATOR :: p + REAL :: s + SU3 :: u0, u1, u2, u3, u4, u5, u6, wa, wb, wc, wd, we, wf + SU3 :: r, u, v + + p = ZERO + + u = ZERO + v = ZERO + call uuu_fwd(u, u1, u2, u3) + call uuu_fwd(v, u2, u1, u0) + + call clover_uuu_uuu(r, u0, wd, u) ; call re_tr_j(p, r, s) + call clover_uuu_dud(r, u, wd, u0) ; call re_tr_j(p, r, s) + call clover_uuu_dud(r, v, we, u3) ; call re_tr_j(p, r, s) + call clover_uuu_uuu(r, u3, we, v) ; call re_tr_j(p, r, s) + + u = ZERO + v = ZERO + call uuu_bwd(u, u6, u5, u4) + call uuu_fwd(v, u0, u6, u5) + + call clover_uuu_dud(r, u, wd, u0) ; call re_tr_j(p, r, -s) + call clover_uuu_uuu(r, u0, wd, u) ; call re_tr_j(p, r, -s) + call clover_uuu_dud(r, u4, wf, v) ; call re_tr_j(p, r, -s) + call clover_uuu_uuu(r, v, wf, u4) ; call re_tr_j(p, r, -s) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_dummy.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_dummy.F90 new file mode 100644 index 0000000000000000000000000000000000000000..bcbe8b7de98fe3e0c379bef7bfaf82cfdfc86fa8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_dummy.F90 @@ -0,0 +1,136 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2000-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! clover_dummy.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine allocate_clover_field_a(a) + + use typedef_clover + use module_vol + implicit none + P_CLOVER_FIELD_A :: a + + call die("allocate_clover_field_a(): must not be called.") +end + +!------------------------------------------------------------------------------- +subroutine allocate_clover_field_b(b) + + use typedef_clover + use module_vol + implicit none + P_CLOVER_FIELD_B :: b + + call die("allocate_clover_field_b(): must not be called.") +end + +!------------------------------------------------------------------------------- +subroutine clover_init(a, b, u, csw_kappa) + + use typedef_clover + use module_vol + implicit none + + CLOVER_FIELD_A, intent(out) :: a + CLOVER_FIELD_B, intent(out) :: b + GAUGE_FIELD, intent(in) :: u + REAL, intent(in) :: csw_kappa + + call die("clover_init(): must not be called.") +end + +!------------------------------------------------------------------------------- +REAL function clover_action(b) + + use typedef_clover + use module_vol + implicit none + + type(type_clover_b) :: b(2, volh) + integer :: i + REAL :: s, global_sum + + + call die("clover_action(): must not be called.") + clover_action = ZERO +end + +!------------------------------------------------------------------------------- +subroutine clover_mult_a(out, a, in, volh) + + implicit none + + COMPLEX, dimension(18, 2, *) :: a + COMPLEX, dimension(NDIRAC, NCOL, *) :: out, in + integer :: volh + + call die("clover_mult_a(): must not be called.") +end + +!------------------------------------------------------------------------------- +subroutine clover_mult_ao(a, x, volh) ! x := A x + + implicit none + + COMPLEX, dimension(18, 2, *) :: a + COMPLEX, dimension(NDIRAC, NCOL, *) :: x + integer :: volh + + call die("clover_mult_ao(): must not be called.") +end + +!------------------------------------------------------------------------------- +subroutine clover_mult_b(b, x, volh) + + implicit none + + COMPLEX, dimension(18, 2, *) :: b + COMPLEX, dimension(NDIRAC, NCOL, *) :: x + integer :: volh + + call die("clover_mult_b(): must not be called.") +end + +!------------------------------------------------------------------------------- +subroutine clover_dsd(eo, p, b, s, u) + + use typedef_clover + use module_vol + implicit none + + integer :: eo + GENERATOR_FIELD :: p + CLOVER_FIELD_B :: b + REAL :: s + GAUGE_FIELD :: u + + call die("clover_dsd(): must not be called.") +end + +!------------------------------------------------------------------------------- +subroutine clover_dsf(eo, p, b, a, s, u) + + use module_vol + implicit none + + integer :: eo + GENERATOR_FIELD :: p + SPINCOL_FIELD :: b, a + REAL :: s + GAUGE_FIELD :: u + + call die("clover_dsf(): must not be called.") +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_f_mu_nu.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_f_mu_nu.F90 new file mode 100644 index 0000000000000000000000000000000000000000..12a0fef130e5e1986f8b65647689f7ad38702b26 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_f_mu_nu.F90 @@ -0,0 +1,94 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! clover_f_mu_nu.F90 - F_mu_nu = (Q_mu_nu - h.c.) / i (missing factor 1/8) +! +!------------------------------------------------------------------------------- +! +! ^ nu +! xmp x_p (xpp) | +! | +! xm_ x xp_ x --> mu +! +! xmm x_m xpm +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine clover_f_mu_nu(f, mu, nu, x, e, u) + + use module_vol + use module_nn + implicit none + + SU3, intent(out) :: f + integer, intent(in) :: mu, nu, x, e + GAUGE_FIELD, intent(in) :: u + + integer :: xmp, x_p, xm_, xp_, xmm, x_m, xpm, o + + ! statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + o = EVEN + ODD - e + + xp_ = nn(x, e, mu, FWD) + xm_ = nn(x, e, mu, BWD) + x_p = nn(x, e, nu, FWD) + x_m = nn(x, e, nu, BWD) + + xmp = nn(xm_, o, nu, FWD) + xmm = nn(xm_, o, nu, BWD) + xpm = nn(xp_, o, nu, BWD) + + if (xmp /= nn(x_p, o, mu, BWD)) call die("colver_f_mu_nu(): xmp") + if (xmm /= nn(x_m, o, mu, BWD)) call die("colver_f_mu_nu(): xmm") + if (xpm /= nn(x_m, o, mu, FWD)) call die("colver_f_mu_nu(): xpm") + + f = ZERO + + call clover_uuuu1(f, u(1, 1, x, e, mu), & + u(1, 1, xp_, o, nu), & + u(1, 1, x_p, o, mu), & + u(1, 1, x, e, nu)) + + call clover_uuuu2(f, u(1, 1, x, e, nu), & + u(1, 1, xmp, e, mu), & + u(1, 1, xm_, o, nu), & + u(1, 1, xm_, o, mu)) + + call clover_uuuu3(f, u(1, 1, xm_, o, mu), & + u(1, 1, xmm, e, nu), & + u(1, 1, xmm, e, mu), & + u(1, 1, x_m, o, nu)) + + call clover_uuuu4(f, u(1, 1, x_m, o, nu), & + u(1, 1, x_m, o, mu), & + u(1, 1, xpm, e, nu), & + u(1, 1, x, e, mu)) + + f(1, 1) = cmplx(TWO * Im(f(1, 1)), ZERO) + f(2, 2) = cmplx(TWO * Im(f(2, 2)), ZERO) + f(3, 3) = cmplx(TWO * Im(f(3, 3)), ZERO) + + f(1, 2) = i_times(conjg(f(2, 1)) - f(1, 2)) + f(1, 3) = i_times(conjg(f(3, 1)) - f(1, 3)) + f(2, 3) = i_times(conjg(f(3, 2)) - f(2, 3)) + + f(2, 1) = conjg(f(1, 2)) + f(3, 1) = conjg(f(1, 3)) + f(3, 2) = conjg(f(2, 3)) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_init.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_init.F90 new file mode 100644 index 0000000000000000000000000000000000000000..8a7b62b1d3853d44f728db2c383a5ba92bf248e5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_init.F90 @@ -0,0 +1,180 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! clover_init.F90 - calculates clover matrix and its inverse +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine clover_init(a, ainv, b, u, csw_kappa) + + use typedef_clover + use module_vol + implicit none + + CLOVER_FIELD_A, intent(out) :: a, ainv + CLOVER_FIELD_B, intent(out) :: b + GAUGE_FIELD, intent(in) :: u + REAL, intent(in) :: csw_kappa + + integer :: i, eo + SU3 :: f, g + type(type_clover_a) :: p, q + REAL :: factor + + TIMING_START(timing_bin_clover_init) + + factor = -csw_kappa / EIGHT + + do eo = EVEN, ODD + !$omp parallel do private(f, g, p, q) + do i = 1, VOLH + call clover_f_mu_nu(f, 2, 1, i, eo, u) + + call clover_init1(p, f) + + call clover_f_mu_nu(f, 3, 2, i, eo, u) + call clover_f_mu_nu(g, 3, 1, i, eo, u) + + call clover_init2(p, f, g) + + call clover_f_mu_nu(f, 3, 4, i, eo, u) + + call clover_init1(q, f) + + call clover_f_mu_nu(f, 1, 4, i, eo, u) + call clover_f_mu_nu(g, 4, 2, i, eo, u) + + call clover_init2(q, f, g) + + call clover_init3(a(1, i, eo), a(2, i, eo), p, q, factor) + + call clover_inv(b(1, i, eo), ainv(1, i, eo), a(1, i, eo)) + call clover_inv(b(2, i, eo), ainv(2, i, eo), a(2, i, eo)) + enddo + enddo + + TIMING_STOP(timing_bin_clover_init) + +end + +!------------------------------------------------------------------------------- +subroutine clover_init1(a, f) + + use typedef_clover + implicit none + type(type_clover_a) :: a + SU3 :: f + + a%i11 = Re(f(1, 1)) + a%i22 = Re(f(2, 2)) + a%i33 = Re(f(3, 3)) + + a%i44 = -a%i11 + a%i55 = -a%i22 + a%i66 = -a%i33 + + a%i12 = f(1, 2) + a%i13 = f(1, 3) + a%i23 = f(2, 3) + + a%i45 = -a%i12 + a%i46 = -a%i13 + a%i56 = -a%i23 + +end + +!------------------------------------------------------------------------------- +subroutine clover_init2(a, f, g) + + use typedef_clover + implicit none + type(type_clover_a) :: a + SU3 :: f, g + + ! statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + a%i14 = f(1, 1) + i_times(g(1, 1)) + a%i15 = f(1, 2) + i_times(g(1, 2)) + a%i16 = f(1, 3) + i_times(g(1, 3)) + + a%i24 = f(2, 1) + i_times(g(2, 1)) + a%i25 = f(2, 2) + i_times(g(2, 2)) + a%i26 = f(2, 3) + i_times(g(2, 3)) + + a%i34 = f(3, 1) + i_times(g(3, 1)) + a%i35 = f(3, 2) + i_times(g(3, 2)) + a%i36 = f(3, 3) + i_times(g(3, 3)) + +end + +!------------------------------------------------------------------------------- +subroutine clover_init3(a1, a2, p, q, s) + + use typedef_clover + implicit none + type(type_clover_a) :: a1, a2, p, q + REAL :: s + +# define CLOVER_INIT_3(I, J) \ +a1%i ## I ## J = s * (p%i ## I ## J + q%i ## I ## J ## ) ; \ +a2%i ## I ## J = s * (p%i ## I ## J - q%i ## I ## J ## ) + +! define => +! a1%iIJ = s * (p%iIJ + q%iIJ) ; a2%iIJ = s * (p%iIJ - q%iIJ) + + CLOVER_INIT_3(1, 1) + CLOVER_INIT_3(1, 2) + CLOVER_INIT_3(1, 3) + CLOVER_INIT_3(1, 4) + CLOVER_INIT_3(1, 5) + CLOVER_INIT_3(1, 6) + + CLOVER_INIT_3(2, 2) + CLOVER_INIT_3(2, 3) + CLOVER_INIT_3(2, 4) + CLOVER_INIT_3(2, 5) + CLOVER_INIT_3(2, 6) + + CLOVER_INIT_3(3, 3) + CLOVER_INIT_3(3, 4) + CLOVER_INIT_3(3, 5) + CLOVER_INIT_3(3, 6) + + CLOVER_INIT_3(4, 4) + CLOVER_INIT_3(4, 5) + CLOVER_INIT_3(4, 6) + + CLOVER_INIT_3(5, 5) + CLOVER_INIT_3(5, 6) + + CLOVER_INIT_3(6, 6) + + a1%i11 = a1%i11 + ONE + a1%i22 = a1%i22 + ONE + a1%i33 = a1%i33 + ONE + a1%i44 = a1%i44 + ONE + a1%i55 = a1%i55 + ONE + a1%i66 = a1%i66 + ONE + + a2%i11 = a2%i11 + ONE + a2%i22 = a2%i22 + ONE + a2%i33 = a2%i33 + ONE + a2%i44 = a2%i44 + ONE + a2%i55 = a2%i55 + ONE + a2%i66 = a2%i66 + ONE + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_inv.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_inv.F90 new file mode 100644 index 0000000000000000000000000000000000000000..6ec62f61115dd60d37fbdca26171f2a2bcba9888 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_inv.F90 @@ -0,0 +1,211 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! clover_inv.F90 - calculates inverse of clover matrix +! +!------------------------------------------------------------------------------- +# include "defs.h" +# include "clover.h" + +!------------------------------------------------------------------------------- +subroutine clover_inv(b, ainv, a) + + use typedef_clover + implicit none + type(type_clover_a), intent(inout) :: a + type(type_clover_a), intent(out) :: ainv + type(type_clover_b), intent(out) :: b + + REAL :: d1, d2, d3, d4, d5 + + ! statement function: + + COMPLEX :: z + REAL :: sq + sq(z) = (Re(z)**2 + Im(z)**2) + + d1 = A11 ! D1 + B11 = ONE / d1 ! 1 / D1 + + B21 = A21 * B11 ! L21 + + d2 = A22 - d1 * sq(B21) ! D2 + B22 = ONE / d2 ! 1 / D2 + + B31 = A31 ! L31 D1 + B32 = A32 - B31 * B12 ! L32 D2 + + B31 = B31 * B11 ! L31 + B32 = B32 * B22 ! L32 + + d3 = A33 - d1 * sq(B31) - d2 * sq(B32) ! D3 + B33 = ONE / d3 ! 1 / D3 + + B41 = A41 ! L41 D1 + B42 = A42 - B41 * B12 ! L42 D2 + B43 = A43 - B41 * B13 - B42 * B23 ! L43 D3 + + B41 = B41 * B11 ! L41 + B42 = B42 * B22 ! L42 + B43 = B43 * B33 ! L43 + + d4 = A44 - d1 * sq(B41) - d2 * sq(B42) - d3 * sq(B43) ! D4 + B44 = ONE / d4 ! 1 / D4 + + B51 = A51 + B52 = A52 - B51 * B12 + B53 = A53 - B51 * B13 - B52 * B23 + B54 = A54 - B51 * B14 - B52 * B24 - B53 * B34 + + B51 = B51 * B11 + B52 = B52 * B22 + B53 = B53 * B33 + B54 = B54 * B44 + + d5 = A55 - d1 * sq(B51) - d2 * sq(B52) - d3 * sq(B53) - d4 * sq(B54) + B55 = ONE / d5 + + B61 = A61 + B62 = A62 - B61 * B12 + B63 = A63 - B61 * B13 - B62 * B23 + B64 = A64 - B61 * B14 - B62 * B24 - B63 * B34 + B65 = A65 - B61 * B15 - B62 * B25 - B63 * B35 - B64 * B45 + + B61 = B61 * B11 + B62 = B62 * B22 + B63 = B63 * B33 + B64 = B64 * B44 + B65 = B65 * B55 + + B66 = A66 - d1 * sq(B61) - d2 * sq(B62) - d3 * sq(B63) & + - d4 * sq(B64) - d5 * sq(B65) + + B66 = ONE / B66 + + call clover_inv2(ainv, b) + + B11 = HALF * B11 + B22 = HALF * B22 + B33 = HALF * B33 + B44 = HALF * B44 + B55 = HALF * B55 + B66 = HALF * B66 + + A11 = HALF * A11 + A12 = HALF * A12 + A13 = HALF * A13 + A14 = HALF * A14 + A15 = HALF * A15 + A16 = HALF * A16 + + A22 = HALF * A22 + A23 = HALF * A23 + A24 = HALF * A24 + A25 = HALF * A25 + A26 = HALF * A26 + + A33 = HALF * A33 + A34 = HALF * A34 + A35 = HALF * A35 + A36 = HALF * A36 + + A44 = HALF * A44 + A45 = HALF * A45 + A46 = HALF * A46 + + A55 = HALF * A55 + A56 = HALF * A56 + + A66 = HALF * A66 + +end + +!------------------------------------------------------------------------------- +subroutine clover_inv2(a, b) + + use typedef_clover + implicit none + type(type_clover_a), intent(out) :: a + type(type_clover_b), intent(in) :: b + + COMPLEX, dimension(6) :: u, x, y + + call inv(1) + A11 = Re(x(1)) + A12 = x(2) + A13 = x(3) + A14 = x(4) + A15 = x(5) + A16 = x(6) + + call inv(2) + A22 = Re(x(2)) + A23 = x(3) + A24 = x(4) + A25 = x(5) + A26 = x(6) + + call inv(3) + A33 = Re(x(3)) + A34 = x(4) + A35 = x(5) + A36 = x(6) + + call inv(4) + A44 = Re(x(4)) + A45 = x(5) + A46 = x(6) + + call inv(5) + A55 = Re(x(5)) + A56 = x(6) + + call inv(6) + A66 = Re(x(6)) + + +CONTAINS + + subroutine inv(i) + + integer :: i + + u = ZERO + u(i) = ONE + + y(1) = u(1) + y(2) = u(2) - B21 * y(1) + y(3) = u(3) - B31 * y(1) - B32 * y(2) + y(4) = u(4) - B41 * y(1) - B42 * y(2) - B43 * y(3) + y(5) = u(5) - B51 * y(1) - B52 * y(2) - B53 * y(3) - B54 * y(4) + y(6) = u(6) - B61 * y(1) - B62 * y(2) - B63 * y(3) - B64 * y(4) - B65 * y(5) + + x(6) = y(6) * B66 + x(5) = y(5) * B55 - B56 * x(6) + x(4) = y(4) * B44 - B45 * x(5) - B46 * x(6) + x(3) = y(3) * B33 - B34 * x(4) - B35 * x(5) & + - B36 * x(6) + x(2) = y(2) * B22 - B23 * x(3) - B24 * x(4) & + - B25 * x(5) - B26 * x(6) + x(1) = y(1) * B11 - B12 * x(2) - B13 * x(3) & + - B14 * x(4) - B15 * x(5) - B16 * x(6) + + x(1) = HALF * conjg(x(1)) + x(2) = HALF * conjg(x(2)) + x(3) = HALF * conjg(x(3)) + x(4) = HALF * conjg(x(4)) + x(5) = HALF * conjg(x(5)) + x(6) = HALF * conjg(x(6)) + + end subroutine inv + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_a.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_a.F90 new file mode 100644 index 0000000000000000000000000000000000000000..2c03c4ad17b4c5a09bfd9f8684856e868364d596 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_a.F90 @@ -0,0 +1,87 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! clover_mult_a.F90 +! +!------------------------------------------------------------------------------- +# define CLOVER_AS_COMPLEX_ARRAY +# include "defs.h" +# include "clover.h" + +!------------------------------------------------------------------------------- +subroutine clover_mult_a(out, a, in, volh) ! out := A in + + implicit none + + COMPLEX, dimension(18, 2, *) :: a + COMPLEX, dimension(NDIRAC, NCOL, *) :: out, in + integer :: volh + + integer :: i + COMPLEX :: x1, x2, x3, x4, x5, x6 + COMPLEX :: y1, y2, y3, y4, y5, y6 + + TIMING_START(timing_bin_clover_mult_a) + + !$omp parallel do private(x1, x2, x3, x4, x5, x6, y1, y2, y3, y4, y5, y6) + do i = 1, volh + x1 = in(SC1, i) + in(SC7, i) + x2 = in(SC2, i) + in(SC8, i) + x3 = in(SC3, i) + in(SC9, i) + x4 = in(SC4, i) + in(SC10, i) + x5 = in(SC5, i) + in(SC11, i) + x6 = in(SC6, i) + in(SC12, i) + +# define J 1 +# include "clover_mult_a.h90" + + out(SC1, i) = y1 + out(SC2, i) = y2 + out(SC3, i) = y3 + out(SC4, i) = y4 + out(SC5, i) = y5 + out(SC6, i) = y6 + out(SC7, i) = y1 + out(SC8, i) = y2 + out(SC9, i) = y3 + out(SC10, i) = y4 + out(SC11, i) = y5 + out(SC12, i) = y6 + + x1 = in(SC1, i) - in(SC7, i) + x2 = in(SC2, i) - in(SC8, i) + x3 = in(SC3, i) - in(SC9, i) + x4 = in(SC4, i) - in(SC10, i) + x5 = in(SC5, i) - in(SC11, i) + x6 = in(SC6, i) - in(SC12, i) + +# undef J +# define J 2 +# include "clover_mult_a.h90" + + out(SC1, i) = out(SC1, i) + y1 + out(SC2, i) = out(SC2, i) + y2 + out(SC3, i) = out(SC3, i) + y3 + out(SC4, i) = out(SC4, i) + y4 + out(SC5, i) = out(SC5, i) + y5 + out(SC6, i) = out(SC6, i) + y6 + out(SC7, i) = out(SC7, i) - y1 + out(SC8, i) = out(SC8, i) - y2 + out(SC9, i) = out(SC9, i) - y3 + out(SC10, i) = out(SC10, i) - y4 + out(SC11, i) = out(SC11, i) - y5 + out(SC12, i) = out(SC12, i) - y6 + + enddo + + TIMING_STOP(timing_bin_clover_mult_a) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_a.h90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_a.h90 new file mode 100644 index 0000000000000000000000000000000000000000..49a8b2aaa3d48d29aefc6233d5379d6101708785 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_a.h90 @@ -0,0 +1,12 @@ + y1 = A11 * x1 + A12 * x2 + A13 * x3 & + + A14 * x4 + A15 * x5 + A16 * x6 + y2 = A21 * x1 + A22 * x2 + A23 * x3 & + + A24 * x4 + A25 * x5 + A26 * x6 + y3 = A31 * x1 + A32 * x2 + A33 * x3 & + + A34 * x4 + A35 * x5 + A36 * x6 + y4 = A41 * x1 + A42 * x2 + A43 * x3 & + + A44 * x4 + A45 * x5 + A46 * x6 + y5 = A51 * x1 + A52 * x2 + A53 * x3 & + + A54 * x4 + A55 * x5 + A56 * x6 + y6 = A61 * x1 + A62 * x2 + A63 * x3 & + + A64 * x4 + A65 * x5 + A66 * x6 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_ao.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_ao.F90 new file mode 100644 index 0000000000000000000000000000000000000000..c35d1656cbb4dcbfe1568bd8e2b59c6361d5ab4a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_ao.F90 @@ -0,0 +1,87 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! clover_mult_ao.F90 - ao: "A overwrite" +! +!------------------------------------------------------------------------------- +# define CLOVER_AS_COMPLEX_ARRAY +# include "defs.h" +# include "clover.h" + +!------------------------------------------------------------------------------- +subroutine clover_mult_ao(a, x, volh) ! x := A x + + implicit none + + COMPLEX, dimension(18, 2, *) :: a + COMPLEX, dimension(NDIRAC, NCOL, *) :: x + integer :: volh + + integer :: i + COMPLEX :: x1, x2, x3, x4, x5, x6 + COMPLEX :: y1, y2, y3, y4, y5, y6 + + TIMING_START(timing_bin_clover_mult_ao) + + !$omp parallel do private(x1, x2, x3, x4, x5, x6, y1, y2, y3, y4, y5, y6) + do i = 1, volh + x1 = x(SC1, i) + x(SC7, i) + x2 = x(SC2, i) + x(SC8, i) + x3 = x(SC3, i) + x(SC9, i) + x4 = x(SC4, i) + x(SC10, i) + x5 = x(SC5, i) + x(SC11, i) + x6 = x(SC6, i) + x(SC12, i) + +# define J 1 +# include "clover_mult_a.h90" + + x1 = x(SC1, i) - x(SC7, i) + x2 = x(SC2, i) - x(SC8, i) + x3 = x(SC3, i) - x(SC9, i) + x4 = x(SC4, i) - x(SC10, i) + x5 = x(SC5, i) - x(SC11, i) + x6 = x(SC6, i) - x(SC12, i) + + x(SC1, i) = y1 + x(SC2, i) = y2 + x(SC3, i) = y3 + x(SC4, i) = y4 + x(SC5, i) = y5 + x(SC6, i) = y6 + x(SC7, i) = y1 + x(SC8, i) = y2 + x(SC9, i) = y3 + x(SC10, i) = y4 + x(SC11, i) = y5 + x(SC12, i) = y6 + +# undef J +# define J 2 +# include "clover_mult_a.h90" + + x(SC1, i) = x(SC1, i) + y1 + x(SC2, i) = x(SC2, i) + y2 + x(SC3, i) = x(SC3, i) + y3 + x(SC4, i) = x(SC4, i) + y4 + x(SC5, i) = x(SC5, i) + y5 + x(SC6, i) = x(SC6, i) + y6 + x(SC7, i) = x(SC7, i) - y1 + x(SC8, i) = x(SC8, i) - y2 + x(SC9, i) = x(SC9, i) - y3 + x(SC10, i) = x(SC10, i) - y4 + x(SC11, i) = x(SC11, i) - y5 + x(SC12, i) = x(SC12, i) - y6 + + enddo + + TIMING_STOP(timing_bin_clover_mult_ao) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_b.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_b.F90 new file mode 100644 index 0000000000000000000000000000000000000000..b0d30d053b882b1cdbc68c9ab3b09d30956e096f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_b.F90 @@ -0,0 +1,88 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! clover_mult_b.F90 +! +!------------------------------------------------------------------------------- +# define CLOVER_AS_COMPLEX_ARRAY +# include "defs.h" +# include "clover.h" + +!------------------------------------------------------------------------------- +subroutine clover_mult_b(b, x, volh) ! x := B x + + implicit none + + COMPLEX, dimension(18, 2, *) :: b + COMPLEX, dimension(NDIRAC, NCOL, *) :: x + integer :: volh + + integer :: i + COMPLEX :: x1, x2, x3, x4, x5, x6 + COMPLEX :: y1, y2, y3, y4, y5, y6 + + TIMING_START(timing_bin_clover_mult_b) + + !$omp parallel do private(x1, x2, x3, x4, x5, x6, y1, y2, y3, y4, y5, y6) + do i = 1, volh + + y1 = x(SC1, i) + x(SC7, i) + y2 = x(SC2, i) + x(SC8, i) + y3 = x(SC3, i) + x(SC9, i) + y4 = x(SC4, i) + x(SC10, i) + y5 = x(SC5, i) + x(SC11, i) + y6 = x(SC6, i) + x(SC12, i) + +# define J 1 +# include "clover_mult_b.h90" + + y1 = x(SC1, i) - x(SC7, i) + y2 = x(SC2, i) - x(SC8, i) + y3 = x(SC3, i) - x(SC9, i) + y4 = x(SC4, i) - x(SC10, i) + y5 = x(SC5, i) - x(SC11, i) + y6 = x(SC6, i) - x(SC12, i) + + x(SC1, i) = x1 + x(SC2, i) = x2 + x(SC3, i) = x3 + x(SC4, i) = x4 + x(SC5, i) = x5 + x(SC6, i) = x6 + x(SC7, i) = x1 + x(SC8, i) = x2 + x(SC9, i) = x3 + x(SC10, i) = x4 + x(SC11, i) = x5 + x(SC12, i) = x6 + +# undef J +# define J 2 +# include "clover_mult_b.h90" + + x(SC1, i) = x(SC1, i) + x1 + x(SC2, i) = x(SC2, i) + x2 + x(SC3, i) = x(SC3, i) + x3 + x(SC4, i) = x(SC4, i) + x4 + x(SC5, i) = x(SC5, i) + x5 + x(SC6, i) = x(SC6, i) + x6 + x(SC7, i) = x(SC7, i) - x1 + x(SC8, i) = x(SC8, i) - x2 + x(SC9, i) = x(SC9, i) - x3 + x(SC10, i) = x(SC10, i) - x4 + x(SC11, i) = x(SC11, i) - x5 + x(SC12, i) = x(SC12, i) - x6 + + enddo + + TIMING_STOP(timing_bin_clover_mult_b) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_b.h90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_b.h90 new file mode 100644 index 0000000000000000000000000000000000000000..f38943b10b35aaf30abdd18030e26f65e67a6930 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_b.h90 @@ -0,0 +1,15 @@ + y2 = y2 - B21 * y1 + y3 = y3 - B31 * y1 - B32 * y2 + y4 = y4 - B41 * y1 - B42 * y2 - B43 * y3 + y5 = y5 - B51 * y1 - B52 * y2 - B53 * y3 - B54 * y4 + y6 = y6 - B61 * y1 - B62 * y2 - B63 * y3 - B64 * y4 - B65 * y5 + + x6 = y6 * B66 + x5 = y5 * B55 - B56 * x6 + x4 = y4 * B44 - B45 * x5 - B46 * x6 + x3 = y3 * B33 - B34 * x4 - B35 * x5 & + - B36 * x6 + x2 = y2 * B22 - B23 * x3 - B24 * x4 & + - B25 * x5 - B26 * x6 + x1 = y1 * B11 - B12 * x2 - B13 * x3 & + - B14 * x4 - B15 * x5 - B16 * x6 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_t_init.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_t_init.F90 new file mode 100644 index 0000000000000000000000000000000000000000..1c98847df50f08a5779ce46a72390f5d3b48243d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_t_init.F90 @@ -0,0 +1,56 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! clover_t_init.F90 - calculates T +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine clover_t_init(t, b) + + use typedef_clover + use module_vol + implicit none + + CLOVER_FIELD_C :: t + type(type_clover_b) :: b(2, volh) + + SPINCOL_FIELD :: x +!dir$ cache_align x + + integer :: c1, c2, s1, s2, i + + do c2 = 1, NCOL + do s2 = 1, NDIRAC + + x = ZERO + !$omp parallel do + do i = 1, volh + x(s2, c2, i) = ONE + enddo + + call clover_mult_b(b, x, volh) + + !$omp parallel do private(c1, s1) + do i = 1, volh + do c1 = 1, NCOL + do s1 = 1, NDIRAC + t(s1, c1, s2, c2, i) = x(s1, c1, i) + enddo + enddo + enddo + + enddo + enddo + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts.F90 new file mode 100644 index 0000000000000000000000000000000000000000..c0fb1a8f81f1bf8612951f5bb451f6d157736db8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts.F90 @@ -0,0 +1,165 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! clover_ts.F90 - calculates T * sigma +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine clover_ts(mu, nu, w, t) ! w = t sigma_mu_nu + + use module_vol + implicit none + + integer :: mu, nu + SU3_FIELD :: w + CLOVER_FIELD_C :: t + + if (mu == 1) then + if (nu == 2) then ; call clover_ts_12(w, t) + elseif (nu == 3) then ; call clover_ts_13(w, t) + elseif (nu == 4) then ; call clover_ts_14(w, t) ; endif + elseif (mu == 2) then + if (nu == 1) then ; call clover_ts_21(w, t) + elseif (nu == 3) then ; call clover_ts_23(w, t) + elseif (nu == 4) then ; call clover_ts_24(w, t) ; endif + elseif (mu == 3) then + if (nu == 1) then ; call clover_ts_31(w, t) + elseif (nu == 2) then ; call clover_ts_32(w, t) + elseif (nu == 4) then ; call clover_ts_34(w, t) ; endif + elseif (mu == 4) then + if (nu == 1) then ; call clover_ts_41(w, t) + elseif (nu == 2) then ; call clover_ts_42(w, t) + elseif (nu == 3) then ; call clover_ts_43(w, t) ; endif + endif +end + +!------------------------------------------------------------------------------- +subroutine clover_ts_12(w, t) + +# include "clover_ts_head.h90" + w(c1, c2, i) = -t(1, c1, 1, c2, i) & + + t(2, c1, 2, c2, i) & + - t(3, c1, 3, c2, i) & + + t(4, c1, 4, c2, i) +# include "clover_ts_tail.h90" + +!------------------------------------------------------------------------------- +subroutine clover_ts_21(w, t) + +# include "clover_ts_head.h90" + w(c1, c2, i) = t(1, c1, 1, c2, i) & + - t(2, c1, 2, c2, i) & + + t(3, c1, 3, c2, i) & + - t(4, c1, 4, c2, i) +# include "clover_ts_tail.h90" + +!------------------------------------------------------------------------------- +subroutine clover_ts_13(w, t) + +# include "clover_ts_head.h90" + w(c1, c2, i) = i_times(t(1, c1, 2, c2, i)) & + - i_times(t(2, c1, 1, c2, i)) & + + i_times(t(3, c1, 4, c2, i)) & + - i_times(t(4, c1, 3, c2, i)) +# include "clover_ts_tail.h90" + +!------------------------------------------------------------------------------- +subroutine clover_ts_31(w, t) + +# include "clover_ts_head.h90" + w(c1, c2, i) = -i_times(t(1, c1, 2, c2, i)) & + + i_times(t(2, c1, 1, c2, i)) & + - i_times(t(3, c1, 4, c2, i)) & + + i_times(t(4, c1, 3, c2, i)) +# include "clover_ts_tail.h90" + +!------------------------------------------------------------------------------- +subroutine clover_ts_14(w, t) + +# include "clover_ts_head.h90" + w(c1, c2, i) = t(1, c1, 4, c2, i) & + + t(2, c1, 3, c2, i) & + + t(3, c1, 2, c2, i) & + + t(4, c1, 1, c2, i) +# include "clover_ts_tail.h90" + +!------------------------------------------------------------------------------- +subroutine clover_ts_41(w, t) + +# include "clover_ts_head.h90" + w(c1, c2, i) = -t(1, c1, 4, c2, i) & + - t(2, c1, 3, c2, i) & + - t(3, c1, 2, c2, i) & + - t(4, c1, 1, c2, i) +# include "clover_ts_tail.h90" + +!------------------------------------------------------------------------------- +subroutine clover_ts_23(w, t) + +# include "clover_ts_head.h90" + w(c1, c2, i) = -t(1, c1, 2, c2, i) & + - t(2, c1, 1, c2, i) & + - t(3, c1, 4, c2, i) & + - t(4, c1, 3, c2, i) +# include "clover_ts_tail.h90" + +!------------------------------------------------------------------------------- +subroutine clover_ts_32(w, t) + +# include "clover_ts_head.h90" + w(c1, c2, i) = t(1, c1, 2, c2, i) & + + t(2, c1, 1, c2, i) & + + t(3, c1, 4, c2, i) & + + t(4, c1, 3, c2, i) +# include "clover_ts_tail.h90" + +!------------------------------------------------------------------------------- +subroutine clover_ts_24(w, t) + +# include "clover_ts_head.h90" + w(c1, c2, i) = i_times(t(1, c1, 4, c2, i)) & + - i_times(t(2, c1, 3, c2, i)) & + + i_times(t(3, c1, 2, c2, i)) & + - i_times(t(4, c1, 1, c2, i)) +# include "clover_ts_tail.h90" + +!------------------------------------------------------------------------------- +subroutine clover_ts_42(w, t) + +# include "clover_ts_head.h90" + w(c1, c2, i) = -i_times(t(1, c1, 4, c2, i)) & + + i_times(t(2, c1, 3, c2, i)) & + - i_times(t(3, c1, 2, c2, i)) & + + i_times(t(4, c1, 1, c2, i)) +# include "clover_ts_tail.h90" + +!------------------------------------------------------------------------------- +subroutine clover_ts_34(w, t) + +# include "clover_ts_head.h90" + w(c1, c2, i) = t(1, c1, 3, c2, i) & + - t(2, c1, 4, c2, i) & + + t(3, c1, 1, c2, i) & + - t(4, c1, 2, c2, i) +# include "clover_ts_tail.h90" + +!------------------------------------------------------------------------------- +subroutine clover_ts_43(w, t) + +# include "clover_ts_head.h90" + w(c1, c2, i) = -t(1, c1, 3, c2, i) & + + t(2, c1, 4, c2, i) & + - t(3, c1, 1, c2, i) & + + t(4, c1, 2, c2, i) +# include "clover_ts_tail.h90" + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts_head.h90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts_head.h90 new file mode 100644 index 0000000000000000000000000000000000000000..fb209369c093d42e29f54e163f8612a271a45387 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts_head.h90 @@ -0,0 +1,17 @@ + use module_vol + implicit none + + SU3_FIELD :: w + CLOVER_FIELD_C :: t + + integer :: i, c1, c2 + + ! statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + !$omp parallel do private(c1, c2) + do i = 1, volh + do c2 = 1, NCOL + do c1 = 1, NCOL diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts_tail.h90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts_tail.h90 new file mode 100644 index 0000000000000000000000000000000000000000..46655ebcdf9f049df5ea6453a96851d927592acb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts_tail.h90 @@ -0,0 +1,5 @@ + enddo + enddo + enddo + +end diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_uuu.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_uuu.F90 new file mode 100644 index 0000000000000000000000000000000000000000..0b513a2c746dba130f2208c72cad2dfb932439f3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_uuu.F90 @@ -0,0 +1,116 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! clover_uuu.F90 - multiplications of three SU(3) matrices +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine clover_uuu_uuu(r, u, v, w) ! r = u * v * w + + implicit none + SU3 :: r, u, v, w + integer :: i, j, k, l + + do i = 1, NCOL + do l = 1, NCOL + r(i, l) = ZERO + do j = 1, NCOL + do k = 1, NCOL + r(i, l) = r(i, l) + u(i, j) * v(j, k) * w(k, l) + enddo + enddo + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine clover_uuu_duu(r, u, v, w) ! r = u+ * v * w + + implicit none + SU3 :: r, u, v, w + integer :: i, j, k, l + + do i = 1, NCOL + do l = 1, NCOL + r(i, l) = ZERO + do j = 1, NCOL + do k = 1, NCOL + r(i, l) = r(i, l) + conjg(u(j, i)) * v(j, k) * w(k, l) + enddo + enddo + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine clover_uuu_udu(r, u, v, w) ! r = u * v+ * w + + implicit none + SU3 :: r, u, v, w + integer :: i, j, k, l + + do i = 1, NCOL + do l = 1, NCOL + r(i, l) = ZERO + do j = 1, NCOL + do k = 1, NCOL + r(i, l) = r(i, l) + u(i, j) * conjg(v(k, j)) * w(k, l) + enddo + enddo + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine clover_uuu_uud(r, u, v, w) ! r = u * v * w+ + + implicit none + SU3 :: r, u, v, w + integer :: i, j, k, l + + do i = 1, NCOL + do l = 1, NCOL + r(i, l) = ZERO + do j = 1, NCOL + do k = 1, NCOL + r(i, l) = r(i, l) + u(i, j) * v(j, k) * conjg(w(l, k)) + enddo + enddo + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine clover_uuu_dud(r, u, v, w) ! r = u+ * v * w+ + + implicit none + SU3 :: r, u, v, w + integer :: i, j, k, l + + do i = 1, NCOL + do l = 1, NCOL + r(i, l) = ZERO + do j = 1, NCOL + do k = 1, NCOL + r(i, l) = r(i, l) + conjg(u(j, i)) * v(j, k) * conjg(w(l, k)) + enddo + enddo + enddo + enddo + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_uuuu.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_uuuu.F90 new file mode 100644 index 0000000000000000000000000000000000000000..a9994b8dc041702c348dea588210dc4e4162e017 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_uuuu.F90 @@ -0,0 +1,117 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! clover_uuuu.F90 - multiplications of four SU(3) matrices +! +!------------------------------------------------------------------------------- +! +! --<-- --<-- +! | | | | +! v 2 ^ v 1 ^ +! | | | | +! -->-- -->-- +! x +! --<-- --<-- +! | | | | +! v 3 ^ v 4 ^ +! | | | | +! -->-- -->-- +! +! uuuu1: uuuu += u1 u2 u3+ u4+ ! + = dagger +! uuuu2: uuuu += u1 u2+ u3+ u4 +! uuuu3: uuuu += u1+ u2+ u3 u4 +! uuuu4: uuuu += u1+ u2 u3 u4+ +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine clover_uuuu1(uuuu, u1, u2, u3, u4) + + implicit none + SU3 :: uuuu, u1, u2, u3, u4 + integer :: i, j, k, l, m + + do i = 1, NCOL + do m = 1, NCOL + do j = 1, NCOL + do k = 1, NCOL + do l = 1, NCOL + uuuu(i,m)= uuuu(i,m)+ u1(i,j) * u2(j,k) * conjg(u3(l,k)) * conjg(u4(m,l)) + enddo + enddo + enddo + enddo + enddo +end + +!------------------------------------------------------------------------------- +subroutine clover_uuuu2(uuuu, u1, u2, u3, u4) + + implicit none + SU3 :: uuuu, u1, u2, u3, u4 + integer :: i, j, k, l, m + + do i = 1, NCOL + do m = 1, NCOL + do j = 1, NCOL + do k = 1, NCOL + do l = 1, NCOL + uuuu(i,m)= uuuu(i,m)+ u1(i,j) * conjg(u2(k,j)) * conjg(u3(l,k)) * u4(l,m) + enddo + enddo + enddo + enddo + enddo +end + + + +!------------------------------------------------------------------------------- +subroutine clover_uuuu3(uuuu, u1, u2, u3, u4) + + implicit none + SU3 :: uuuu, u1, u2, u3, u4 + integer :: i, j, k, l, m + + do i = 1, NCOL + do m = 1, NCOL + do j = 1, NCOL + do k = 1, NCOL + do l = 1, NCOL + uuuu(i,m)= uuuu(i,m)+ conjg(u1(j,i)) * conjg(u2(k,j)) * u3(k,l) * u4(l,m) + enddo + enddo + enddo + enddo + enddo +end + +!------------------------------------------------------------------------------- +subroutine clover_uuuu4(uuuu, u1, u2, u3, u4) + + implicit none + SU3 :: uuuu, u1, u2, u3, u4 + integer :: i, j, k, l, m + + do i = 1, NCOL + do m = 1, NCOL + do j = 1, NCOL + do k = 1, NCOL + do l = 1, NCOL + uuuu(i,m)= uuuu(i,m)+ conjg(u1(j,i)) * u2(j,k) * u3(k,l) * conjg(u4(m,l)) + enddo + enddo + enddo + enddo + enddo +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/ctest.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/ctest.F90 new file mode 100644 index 0000000000000000000000000000000000000000..942ab53a936812d9bdec4f106ad51c82df549b8f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/ctest.F90 @@ -0,0 +1,151 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! ctest.F90 - test of clover matrix multiplications: is (A * inv(A) = 1) ? +! +!------------------------------------------------------------------------------- +# include "defs.h" +# include "clover.h" + +!------------------------------------------------------------------------------- +program ctest + + use typedef_clover + implicit none + + integer, parameter :: volh = 1 + integer, parameter :: nz = 1 + + type(type_clover_a) :: a(2, volh), ainv(2, volh) + type(type_clover_b) :: b(2, volh) + + COMPLEX, dimension(NDIRAC, NCOL, volh) :: z, r + + integer :: i, j, s, c + + do i = 1, volh + do j = 1, 2 + call cinit(a(j, i)) + call clover_inv(b(j, i), ainv(j, i), a(j, i)) + enddo + enddo + + + do j = 1,12 + call zinit(z, j, volh) + !!call clover_mult_b(b, z, volh) + call clover_mult_ao(ainv, z, volh) + call clover_mult_a(r, a, z, volh) + call zwrite(r, j, volh) + +! call zinit(z, j+6, volh) +! call clover_mult_b(b, z, volh) +! call zwrite(z, j+6, volh) + +! call zinit(z, j+6, volh) +! call clover_mult_a(r, a, z, volh) +! call zwrite(r, j+6, volh) + + +!! call clover_mult_b(b, r, volh) + !!call zwrite(r, j, volh) + enddo + +end + +!------------------------------------------------------------------------------- +subroutine cinit(a) + + use typedef_clover + implicit none + type(type_clover_a) :: a + real, intrinsic :: ranf + + A11 = ranf() + A22 = ranf() + A33 = ranf() + A44 = ranf() + A55 = ranf() + A66 = ranf() + + A12 = cmplx(ranf(), ranf()) + A13 = cmplx(ranf(), ranf()) + A14 = cmplx(ranf(), ranf()) + A15 = cmplx(ranf(), ranf()) + A16 = cmplx(ranf(), ranf()) + + A23 = cmplx(ranf(), ranf()) + A24 = cmplx(ranf(), ranf()) + A25 = cmplx(ranf(), ranf()) + A26 = cmplx(ranf(), ranf()) + + A34 = cmplx(ranf(), ranf()) + A35 = cmplx(ranf(), ranf()) + A36 = cmplx(ranf(), ranf()) + + A45 = cmplx(ranf(), ranf()) + A46 = cmplx(ranf(), ranf()) + + A56 = cmplx(ranf(), ranf()) + +end + +!------------------------------------------------------------------------------- +subroutine zinit(z, j, volh) + + implicit none + integer :: volh, i, j + COMPLEX, dimension(NDIRAC, NCOL, volh) :: z + + z = 0 + do i = 1, volh + if (j == 1) z(SC1, i) = 1 + if (j == 2) z(SC2, i) = 1 + if (j == 3) z(SC3, i) = 1 + if (j == 4) z(SC4, i) = 1 + if (j == 5) z(SC5, i) = 1 + if (j == 6) z(SC6, i) = 1 + if (j == 7) z(SC7, i) = 1 + if (j == 8) z(SC8, i) = 1 + if (j == 9) z(SC9, i) = 1 + if (j == 10) z(SC10, i) = 1 + if (j == 11) z(SC11, i) = 1 + if (j == 12) z(SC12, i) = 1 + enddo + +end + +!------------------------------------------------------------------------------- +subroutine zwrite(z, j, volh) + + implicit none + integer :: volh, i, j + COMPLEX, dimension(NDIRAC, NCOL, volh) :: z + + write(6,*) "-----------------------------------------------" + do i = 1, volh + write(6, "(4i4,2f16.8)") j, i, SC1, z(SC1, i) + write(6, "(4i4,2f16.8)") j, i, SC2, z(SC2, i) + write(6, "(4i4,2f16.8)") j, i, SC3, z(SC3, i) + write(6, "(4i4,2f16.8)") j, i, SC4, z(SC4, i) + write(6, "(4i4,2f16.8)") j, i, SC5, z(SC5, i) + write(6, "(4i4,2f16.8)") j, i, SC6, z(SC6, i) + write(6,*) + write(6, "(4i4,2f16.8)") j, i, SC7, z(SC7, i) + write(6, "(4i4,2f16.8)") j, i, SC8, z(SC8, i) + write(6, "(4i4,2f16.8)") j, i, SC9, z(SC9, i) + write(6, "(4i4,2f16.8)") j, i, SC10, z(SC10, i) + write(6, "(4i4,2f16.8)") j, i, SC11, z(SC11, i) + write(6, "(4i4,2f16.8)") j, i, SC12, z(SC12, i) + enddo + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..8dd716e619545bfc4323d61b9c446881b58769d1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/Makefile @@ -0,0 +1,89 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +# +#------------------------------------------------------------------------------- +# +# comm/Makefile +# +#=============================================================================== + +include ../Makefile.defs + +MODULES_DIR = ../modules + +.SUFFIXES: +.SUFFIXES: .a .o .F90 + +.F90.o: + $(FPP) -I.. $(FPPFLAGS) $< > $*.f90 + $(F90) -c $(FFLAGS) -I$(MODULES_DIR) $*.f90 + + +OBJS_MPI = \ + dotprod.o \ + comm_mpi.o \ + allocate.o \ + field_io_mpi.o \ + pes_mpi.o \ + reduction_mpi.o \ + seed_mpi.o \ + xbound_mpi.o + +OBJS_SHMEM = \ + dotprod.o \ + comm_shmem.o \ + allocate_shmem.o \ + field_io_shmem.o \ + reduction_shmem.o \ + seed_shmem.o \ + xbound_shmem.o + +OBJS_SHMEMPI = \ + dotprod.o \ + comm_shmempi.o \ + allocate_shmem.o \ + field_io_mpi.o \ + pes_mpi.o \ + reduction_mpi.o \ + seed_mpi.o \ + xbound_shmem.o + +OBJS_SINGLE_PE = \ + dotprod.o \ + allocate.o \ + comm_single_pe.o \ + field_io_single_pe.o \ + pes_single_pe.o \ + reduction_single_pe.o \ + seed_single_pe.o \ + xbound_single_pe.o + +$(LIBCOMM): + +fast: + $(FAST_MAKE) + +lib_mpi.a: $(OBJS_MPI) + $(AR) $(ARFLAGS) $@ $(OBJS_MPI) + $(RANLIB) $@ + +lib_shmem.a: $(OBJS_SHMEM) + $(AR) $(ARFLAGS) $@ $(OBJS_SHMEM) + $(RANLIB) $@ + +lib_shmempi.a: $(OBJS_SHMEMPI) + $(AR) $(ARFLAGS) $@ $(OBJS_SHMEMPI) + $(RANLIB) $@ + +lib_single_pe.a: $(OBJS_SINGLE_PE) + $(AR) $(ARFLAGS) $@ $(OBJS_SINGLE_PE) + $(RANLIB) $@ + +clobber: + rm -f *.[Tiod] *.f90 *.mod work.pc work.pcl + rm -f lib_mpi.a lib_shmem.a lib_shmempi.a lib_single_pe.a diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/allocate.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/allocate.F90 new file mode 100644 index 0000000000000000000000000000000000000000..6993128034183d80f1efac6cb00aa360b7a8bfb2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/allocate.F90 @@ -0,0 +1,133 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! allocate.F90 - allocation of gauge and pseudo fermion fields +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine allocate_g_field(u) + + use module_vol + implicit none + P_GAUGE_FIELD :: u + + if (associated(u)) then + call die("allocate_g_field(): memory leak") + else + allocate(u(NCOL, NCOL, volh_tot, EVEN:ODD, DIM)) + call conf_zero(u) + endif +end + +!------------------------------------------------------------------------------- +subroutine allocate_g_field_io(u) + + use module_lattice_io + implicit none + P_GAUGE_FIELD_IO :: u + + if (associated(u)) then + call die("allocate_g_field_io(): memory leak") + else + allocate(u(NCOL, NCOL-1, DIM, 0:NX-1, 0:NY-1, 0:NZ-1, 0:NT-1)) + endif +end + +!------------------------------------------------------------------------------- +subroutine allocate_gen_field(x) + + use module_vol + implicit none + P_GENERATOR_FIELD :: x + + integer :: i, eo, mu + + if (associated(x)) then + call die("allocate_gen_field(): memory leak") + else + allocate(x(NGEN, volh_tot, EVEN:ODD, DIM)) + do mu = 1, DIM + do eo = EVEN, ODD + !$omp parallel do + do i = 1, volh + x(1, i, eo, mu) = ZERO + x(2, i, eo, mu) = ZERO + x(3, i, eo, mu) = ZERO + x(4, i, eo, mu) = ZERO + x(5, i, eo, mu) = ZERO + x(6, i, eo, mu) = ZERO + x(7, i, eo, mu) = ZERO + x(8, i, eo, mu) = ZERO + enddo + enddo + enddo + endif +end + +!------------------------------------------------------------------------------- +subroutine allocate_sc_field(x) + + use module_vol + implicit none + P_SPINCOL_FIELD :: x + + if (associated(x)) then + call die("allocate_sc_field(): memory leak") + else + allocate(x(NDIRAC, NCOL, volh_tot)) + call sc_zero(x) + endif +end + +!------------------------------------------------------------------------------- +subroutine allocate_sc_field_io(x) + + use module_lattice_io + implicit none + P_SPINCOL_FIELD_IO :: x + + if (associated(x)) then + call die("allocate_sc_field_io(): memory leak") + else + allocate(x(NDIRAC, NCOL, 0:NXH-1, 0:NY-1, 0:NZ-1, 0:NT-1)) + endif +end + +!------------------------------------------------------------------------------- +subroutine allocate_sc_overindexed(x) + + use module_vol + implicit none + P_SPINCOL_OVERINDEXED :: x + + if (associated(x)) then + call die("allocate_sc_overindexed(): memory leak") + else + allocate(x(SIZE_COMPLEX*NDIRAC*NCOL*volh_tot)) + endif +end + +!------------------------------------------------------------------------------- +subroutine allocate_sc2_field(x) + + use module_vol + implicit none + P_SC2_FIELD :: x + + if (associated(x)) then + call die("allocate_sc2_field(): memory leak") + else + allocate(x(2, NCOL, volh_tot, DIM, FWD:BWD)) + endif +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/allocate_shmem.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/allocate_shmem.F90 new file mode 100644 index 0000000000000000000000000000000000000000..8e79bcea19c76dca26bf02917496e3b60f04f2cb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/allocate_shmem.F90 @@ -0,0 +1,229 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! allocate_shmem.F90 - allocation of gauge and pseudo fermion fields using shmem +! +!------------------------------------------------------------------------------- +# include "defs.h" +# include "shmem.h" + +!------------------------------------------------------------------------------- +subroutine allocate_g_field(u) + + use module_vol + implicit none + P_GAUGE_FIELD :: u + + GAUGE_FIELD :: uu + pointer (p_uu, uu) + + integer :: ierr + + if (associated(u)) call die("allocate_g_field(): memory leak") + + call barrier() + call shpalloc(p_uu, SIZE_COMPLEX * NCOL * NCOL * volh_tot * 2 * DIM, ierr, 1) + call cray_pointer_to_f90_pointer(uu) + +CONTAINS + + subroutine cray_pointer_to_f90_pointer(uu) + + implicit none + GAUGE_FIELD, target :: uu + + u => uu + end subroutine cray_pointer_to_f90_pointer + +end + +!------------------------------------------------------------------------------- +subroutine allocate_g_field_io(u) + + use module_lattice_io + use module_vol + implicit none + P_GAUGE_FIELD_IO :: u + + GAUGE_FIELD_IO :: uu + pointer (p_uu, uu) + + integer :: ierr + + if (associated(u)) call die("allocate_g_field_io(): memory leak") + + call barrier() + call shpalloc(p_uu, SIZE_COMPLEX * NCOL * (NCOL-1) * DIM * vol, ierr, 1) + call cray_pointer_to_f90_pointer(uu) + +CONTAINS + + subroutine cray_pointer_to_f90_pointer(uu) + + implicit none + GAUGE_FIELD_IO, target :: uu + + u => uu + end subroutine cray_pointer_to_f90_pointer + +end + +!------------------------------------------------------------------------------- +subroutine allocate_gen_field(u) + + use module_vol + implicit none + P_GENERATOR_FIELD :: u + + GENERATOR_FIELD :: uu + pointer (p_uu, uu) + + integer :: ierr + + if (associated(u)) call die("allocate_gen_field(): memory leak") + + call barrier() + call shpalloc(p_uu, NGEN * volh_tot * 2 * DIM, ierr, 1) + call cray_pointer_to_f90_pointer(uu) + +CONTAINS + + subroutine cray_pointer_to_f90_pointer(uu) + + implicit none + GENERATOR_FIELD, target :: uu + + u => uu + end subroutine cray_pointer_to_f90_pointer + +end + +!------------------------------------------------------------------------------- +subroutine allocate_sc_field(x) + + use module_vol + implicit none + P_SPINCOL_FIELD :: x + + SPINCOL_FIELD :: xx + pointer (p_xx, xx) + + integer :: ierr + + if (associated(x)) call die("allocate_sc_field(): memory leak") + + call barrier() + call shpalloc(p_xx, SIZE_COMPLEX * NDIRAC * NCOL * volh_tot, ierr, 1) + call cray_pointer_to_f90_pointer(xx) + +CONTAINS + + subroutine cray_pointer_to_f90_pointer(xx) + + implicit none + SPINCOL_FIELD, target :: xx + + x => xx + end subroutine cray_pointer_to_f90_pointer + +end + +!------------------------------------------------------------------------------- +subroutine allocate_sc2_field(x) + + use module_vol + implicit none + P_SC2_FIELD :: x + + SC2_FIELD :: xx + pointer (p_xx, xx) + + integer :: ierr + + if (associated(x)) call die("allocate_sc2_field(): memory leak") + + call barrier() + call shpalloc(p_xx, SIZE_COMPLEX * 2 * NCOL * volh_tot * DIM * 2, ierr, 1) + call cray_pointer_to_f90_pointer(xx) + +CONTAINS + + subroutine cray_pointer_to_f90_pointer(xx) + + implicit none + SC2_FIELD, target :: xx + + x => xx + end subroutine cray_pointer_to_f90_pointer + +end + +!------------------------------------------------------------------------------- +subroutine allocate_sc_field_io(x) + + use module_lattice_io + use module_vol + implicit none + P_SPINCOL_FIELD_IO :: x + + SPINCOL_FIELD_IO :: xx + pointer (p_xx, xx) + + integer :: ierr + + if (associated(x)) call die("allocate_sc_field_io(): memory leak") + + call barrier() + call shpalloc(p_xx, SIZE_COMPLEX * NDIRAC * NCOL * volh, ierr, 1) + call cray_pointer_to_f90_pointer(xx) + +CONTAINS + + subroutine cray_pointer_to_f90_pointer(xx) + + implicit none + SPINCOL_FIELD_IO, target :: xx + + x => xx + end subroutine cray_pointer_to_f90_pointer + +end + +!------------------------------------------------------------------------------- +subroutine allocate_sc_overindexed(x) + + use module_vol + implicit none + P_SPINCOL_OVERINDEXED :: x + + SPINCOL_OVERINDEXED :: xx + pointer (p_xx, xx) + + integer :: ierr + + if (associated(x)) call die("allocate_sc_overindexed(): memory leak") + + call barrier() + call shpalloc(p_xx, SIZE_COMPLEX * NDIRAC * NCOL * volh_tot, ierr, 1) + call cray_pointer_to_f90_pointer(xx) + +CONTAINS + + subroutine cray_pointer_to_f90_pointer(xx) + + implicit none + SPINCOL_OVERINDEXED, target :: xx + + x => xx + end subroutine cray_pointer_to_f90_pointer + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/bqcd.pcl b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/bqcd.pcl new file mode 100644 index 0000000000000000000000000000000000000000..906244500b31700684482c3dcfd32f6cec4279db --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/bqcd.pcl @@ -0,0 +1,2 @@ +work.pc +../modules/work.pc diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_mpi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_mpi.F90 new file mode 100644 index 0000000000000000000000000000000000000000..cf4de03f19d6fd754f7db9491642aab5330eb56c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_mpi.F90 @@ -0,0 +1,46 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! comm_mpi.F90 - wrapper for MPI routines +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine comm_init() + + implicit none + include 'mpif.h' + integer ierror + +! call mpi_init(ierror) +end + +!------------------------------------------------------------------------------- +subroutine comm_finalize() + + implicit none + include 'mpif.h' + integer ierror + +! call mpi_finalize(ierror) +end + +!------------------------------------------------------------------------------- +COMM_METHOD function comm_method() + +#ifdef _OPENMP + comm_method = "MPI + OpenMP" +#else + comm_method = "MPI" +#endif +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_shmem.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_shmem.F90 new file mode 100644 index 0000000000000000000000000000000000000000..84a096d709fea1ae7a90083fce27df6ff44cd4da --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_shmem.F90 @@ -0,0 +1,44 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! comm_shmem.F90 - routines for shmem versions +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine comm_init() +#ifdef ALTIX + call start_pes(0) +#endif + return +end + +!------------------------------------------------------------------------------- +subroutine comm_finalize() + return +end + +!------------------------------------------------------------------------------- +COMM_METHOD function comm_method() + +#ifdef _OPENMP + comm_method = "shmem + OpenMP" +#else + comm_method = "shmem" +#endif +end + +!------------------------------------------------------------------------------- +integer function get_d3_buffer_vol() + get_d3_buffer_vol = 0 +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_shmempi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_shmempi.F90 new file mode 100644 index 0000000000000000000000000000000000000000..d592215714ab6e724bcec00ad5138d5601589336 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_shmempi.F90 @@ -0,0 +1,51 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! comm_shmempi.F90 - MPI + shmem on Altix +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine comm_init() + + implicit none + include 'mpif.h' + integer ierror + + call mpi_init(ierror) +end + +!------------------------------------------------------------------------------- +subroutine comm_finalize() + + implicit none + include 'mpif.h' + integer ierror + + call mpi_finalize(ierror) +end + +!------------------------------------------------------------------------------- +COMM_METHOD function comm_method() + +#ifdef _OPENMP + comm_method = "shmem/MPI + OpenMP" +#else + comm_method = "shmem/MPI" +#endif +end + +!------------------------------------------------------------------------------- +integer function get_d3_buffer_vol() + get_d3_buffer_vol = 0 +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_single_pe.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_single_pe.F90 new file mode 100644 index 0000000000000000000000000000000000000000..a977e50363d06772a61c3176c267543584f705f3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_single_pe.F90 @@ -0,0 +1,41 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! comm_single_pe.F90 - (dummy) routines for single CPU version +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine comm_init() + return +end + +!------------------------------------------------------------------------------- +subroutine comm_finalize() + return +end + +!------------------------------------------------------------------------------- +COMM_METHOD function comm_method() + +#ifdef _OPENMP + comm_method = "single_pe + OpenMP" +#else + comm_method = "single_pe" +#endif +end + +!------------------------------------------------------------------------------- +integer function get_d3_buffer_vol() + get_d3_buffer_vol = 0 +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/dotprod.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/dotprod.F90 new file mode 100644 index 0000000000000000000000000000000000000000..3600c9039f33867db6ec3cee765797bc041d0fe4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/dotprod.F90 @@ -0,0 +1,33 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! dotprod.F90 - dot product for parallel computers +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +REAL function dotprod(a, b, n) + + implicit none + integer i, n + REAL a(n), b(n), s, global_sum + + s = ZERO + !$omp parallel do reduction(+: s) + do i = 1, n + s = s + a(i) * b(i) + enddo + + dotprod = global_sum(s) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_mpi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_mpi.F90 new file mode 100644 index 0000000000000000000000000000000000000000..83843827fafee23e802b7692cb5bab6482e1ef2a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_mpi.F90 @@ -0,0 +1,240 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! field_io_mpi.F90 - I/O routine for gauge and pseudo fermion fields using MPI +! +!------------------------------------------------------------------------------- +# include "defs.h" + +#ifndef INCLUDE_MPIF_H +#define INCLUDE_MPIF_H include 'mpif.h' +#endif + +!------------------------------------------------------------------------------- +subroutine field_io(action, m, mx, field, cksum) ! read or write g- or sc-field + + use typedef_cksum + use module_decomp + use module_function_decl + use module_lattice_io + use module_vol + implicit none + INCLUDE_MPIF_H + + character(len = *) :: action + integer :: m, mx + type(type_cksum) :: cksum(0:LT - 1) + + REAL :: field(SIZE_COMPLEX * m, 0:mx - 1, 0:NY - 1, 0:NZ - 1, 0:NT - 1) + + REAL :: buffer(0:(SIZE_COMPLEX * m * mx * NPE(1))-1, 0:(NY * NPE(2))-1) + + FILENAME :: file + integer :: i_pe(DIM) + integer, external :: i_global, ilex + integer :: x, y, z, t, t_global, me, pe, size, rec, recl + integer :: size_field + integer :: pe_x, pe_y, pe_z, pe_t + CHECK_SUM :: check_sum(2), n_bytes + integer :: tt, pe_tt, pe_io + integer :: status(MPI_STATUS_SIZE), ierror + integer :: count, block_length, stride, buf_type + logical :: io_pe + + + count = NY + block_length = SIZE_COMPLEX * m * mx + stride = block_length * NPE(1) + + size = block_length * NY ! words in send/recv + size_field = block_length * NY * NZ * NT ! words in "field" + n_bytes = size * NPE(1) * NPE(2) * RKIND ! bytes in "buffer" + recl = n_bytes ! cast to standard integer + + ASSERT(mod(recl, RECL_UNIT) == 0) + recl = recl / RECL_UNIT + + i_pe = decomp%std%i_pe + + call mpi_type_vector(count, block_length, stride, BQCD_REAL, buf_type, ierror) + call mpi_type_commit(buf_type, ierror) + + if (action == "write") call swap_endian8(size_field, field) + + if (i_pe(1) == 0 .and. i_pe(2) == 0 .and. i_pe(3) == 0) then + io_pe = .true. + else + io_pe = .false. + endif + + pe_t = i_pe(4) + do t = 0, NT - 1 + t_global = i_global(t, NT, i_pe(4)) + + file = cksum(t_global)%file + + if (io_pe) then + open(UCONF, file = file, action = action, form = "unformatted", & + access = "direct", recl = recl) + endif + + rec = 0 + call cksum_init() + + do pe_z = 0, NPE(3) - 1 + do z = 0, NZ - 1 + rec = rec + 1 + + if (io_pe .and. action == "read") then + read(UCONF, rec = rec) buffer + call cksum_add(buffer, n_bytes) + endif + + do pe_y = 0, NPE(2) - 1 + do pe_x = 0, NPE(1) - 1 + + y = count * pe_y + x = block_length * pe_x + + call field_io_pes(pe, pe_io, (/pe_x, pe_y, pe_z, pe_t/)) + + if (io_pe) then + if (my_pe() /= pe_io) call die("my_pe() /= pe_io") + endif + + if (my_pe() == pe .and. my_pe() == pe_io) then + call field_io_seq(action, count, block_length, stride, & + field(1,0,0,z,t), buffer(x,y)) + else + if (action == "read") then + if (my_pe() == pe_io) then + call mpi_ssend(buffer(x,y), 1, buf_type, & + pe, 0, MPI_COMM_WORLD, ierror) + endif + if (my_pe() == pe) then + call mpi_recv(field(1,0,0,z,t), size, BQCD_REAL, & + pe_io, 0, MPI_COMM_WORLD, status, ierror) + endif + else + if (my_pe() == pe_io) then + call mpi_recv(buffer(x,y), 1, buf_type, & + pe, 0, MPI_COMM_WORLD, status, ierror) + endif + if (my_pe() == pe) then + call mpi_ssend(field(1,0,0,z,t), size, BQCD_REAL, & + pe_io, 0, MPI_COMM_WORLD, ierror) + endif + endif + endif + + enddo + enddo + + if (io_pe .and. action == "write") then + write(UCONF, rec = rec) buffer + call cksum_add(buffer, n_bytes) + endif + enddo + enddo + + if (io_pe) then + close(UCONF) + call cksum_get(check_sum(1), check_sum(2)) + + if (action == "read") then + + if (check_sum(1) /= cksum(t_global)%sum) then + call die("field_io(): check sum error in file " // file) + endif + + else + + if (my_pe() == 0) then + cksum(t_global)%sum = check_sum(1) + cksum(t_global)%bytes = check_sum(2) + do pe_tt = 1, NPE(4) - 1 + tt = i_global(t, NT, pe_tt) + call mpi_recv(check_sum, 2, BQCD_CHECK_SUM, & + MPI_ANY_SOURCE, tt, MPI_COMM_WORLD, status, ierror) + cksum(tt)%sum = check_sum(1) + cksum(tt)%bytes = check_sum(2) + enddo + else + call mpi_ssend(check_sum, 2, BQCD_CHECK_SUM, 0, t_global, & + MPI_COMM_WORLD, ierror) + endif + endif + endif + enddo + + call swap_endian8(size_field, field) + call mpi_type_free(buf_type, ierror) +end + +!------------------------------------------------------------------------------- +subroutine field_io_seq(action, count, block_length, stride, field, buffer) + + use module_lattice_io + use module_vol + implicit none + + character(len = *) :: action + integer :: count, block_length, stride + REAL :: field(*) + REAL :: buffer(*) + integer :: i, j, x, y + + i = 0 + j = 0 + do y = 1, count + do x = 1, block_length + i = i + 1 + if (action == "read") then + field(i) = buffer(j + x) + else + buffer(j + x) = field(i) + endif + enddo + j = j + stride + enddo + +end + +!------------------------------------------------------------------------------- +subroutine field_io_pes(pe, pe_io, x_std) + + use module_lattice ! in contrast to the calling routine !! + implicit none + integer, intent(out) :: pe, pe_io + integer, intent(in) :: x_std(DIM) + integer :: x_act(DIM), x_std_io(DIM), x_act_io(DIM) + integer, external :: ilex + + x_std_io(1) = 0 + x_std_io(2) = 0 + x_std_io(3) = 0 + x_std_io(4) = x_std(4) + + x_act(1) = x_std(gamma_index(1)) + x_act(2) = x_std(gamma_index(2)) + x_act(3) = x_std(gamma_index(3)) + x_act(4) = x_std(gamma_index(4)) + + x_act_io(1) = x_std_io(gamma_index(1)) + x_act_io(2) = x_std_io(gamma_index(2)) + x_act_io(3) = x_std_io(gamma_index(3)) + x_act_io(4) = x_std_io(gamma_index(4)) + + pe = ilex(DIM, x_act, NPE) + pe_io = ilex(DIM, x_act_io, NPE) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_shmem.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_shmem.F90 new file mode 100644 index 0000000000000000000000000000000000000000..00da858897bd42b72f8acc299cb36ed6c6481034 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_shmem.F90 @@ -0,0 +1,136 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! field_io_shmem.F90 - I/O routine for gauge and pseudo fermion fields (shmem) +! +!------------------------------------------------------------------------------- +# include "defs.h" +# include "shmem.h" + +!------------------------------------------------------------------------------- +subroutine field_io(action, m, mx, field, cksum) ! read or write g- or sc-field + + use typedef_cksum + use module_function_decl + use module_lattice + use module_vol + implicit none + + character(len = *) :: action + integer :: m, mx + COMPLEX :: field(m, 0:mx-1, 0:NY-1, 0:NZ-1, 0:NT-1) + type(type_cksum) :: cksum(0:LT-1) + +!!! COMPLEX :: buffer(m * mx * NY, 0:NPE(2)-1) +!!!!dir$ symmetric buffer + FILENAME :: file + integer :: i_pe(DIM) + integer, external :: i_global, ilex + integer :: t, t_global, z, me, pe, size, rec, recl + integer :: size_field + integer :: pe_x, pe_y, pe_z, pe_t + CHECK_SUM :: check_sum(2), n_bytes + + COMPLEX :: buffer(m * mx * NY, 0:NPE(2)-1) + CHECK_SUM :: check_sum_master(2, 0:LT-1) + + pointer(p_buffer, buffer) + pointer(p_check_sum_master, check_sum_master) + + save p_buffer + save p_check_sum_master + + logical, save :: initialized = .false. + integer :: ierr + + if (.not. initialized) then + call barrier() + call shpalloc(p_buffer, SIZE_COMPLEX * m * mx * NY * NPE(2), ierr, 1) + call barrier() + call shpalloc(p_check_sum_master, 2 * LT, ierr, 1) + initialized = .true. + endif + + call barrier() + + call unlex(my_pe(), DIM, i_pe, NPE) + + if (i_pe(2) == 0 .and. i_pe(3) == 0) then + size = SIZE_COMPLEX * m * mx * NY ! size in shmem + size_field = size * NZ * NT ! size of "field" + n_bytes = size * RKIND * NPE(2) ! no. of bytes of u_buf + recl = n_bytes ! cast to standard integer + + ASSERT(mod(recl, RECL_UNIT) == 0) + recl = recl / RECL_UNIT + + if (action == "write") call swap_endian8(size_field, field) + + pe_t = i_pe(4) + do t = 0, NT - 1 + t_global = i_global(t, NT, i_pe(4)) + + file = cksum(t_global)%file + + open(UCONF, file = file, action = action, form = "unformatted", & + access = "direct", recl = recl) + rec = 0 + call cksum_init() + + do pe_z = 0, NPE(3) - 1 + do z = 0, NZ - 1 + rec = rec + 1 + + if (action == "read") then + read(UCONF, rec = rec) buffer + call cksum_add(buffer, n_bytes) + endif + + do pe_y = 0, NPE(2) - 1 + pe_x = 0 + pe = ilex(DIM, (/pe_x, pe_y, pe_z, pe_t/), NPE) + + if (action == "read") then + call shmem_put(field(1,0,0,z,t), buffer(1, pe_y), size, pe) + else + call shmem_get(buffer(1, pe_y), field(1,0,0,z,t), size, pe) + endif + + enddo + if (action == "write") then + write(UCONF, rec = rec) buffer + call cksum_add(buffer, n_bytes) + endif + enddo + enddo + close(UCONF) + call cksum_get(check_sum(1), check_sum(2)) + if (action == "read") then + if (check_sum(1) /= cksum(t_global)%sum) then + call die("field_io(): check sum error in file " // file) + endif + else + call shmem_put(check_sum_master(1, t_global), check_sum, 2, 0) + endif + enddo + endif + call barrier() + + if (action == "write") then + do t_global = 0, LT - 1 + cksum(t_global)%sum = check_sum_master(1, t_global) + cksum(t_global)%bytes = check_sum_master(2, t_global) + enddo + endif + + call swap_endian8(size_field, field) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_single_pe.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_single_pe.F90 new file mode 100644 index 0000000000000000000000000000000000000000..e72e1030314a7218443002bf5e15447eba370efb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_single_pe.F90 @@ -0,0 +1,52 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! field_io_single_pe.F90 - I/O routine for gauge and pseudo fermion fields +! (single processor version) +! +!------------------------------------------------------------------------------- +# define INCLUDE_MPIF_H + +# define MPI_STATUS_SIZE 2 +# define MPI_REAL8 0 +# define mpi_real8 0 +# define MPI_COMM_WORLD 0 +# define MPI_INTEGER8 0 +# define mpi_integer8 0 +# define MPI_ANY_SOURCE 0 + +# include "field_io_mpi.F90" + +!------------------------------------------------------------------------------- +subroutine mpi_type_vector(a, b, c, d, e, f) + return +end + +!------------------------------------------------------------------------------- +subroutine mpi_type_commit(a, b) + return +end + +!------------------------------------------------------------------------------- +subroutine mpi_type_free(a, b) + return +end + +!------------------------------------------------------------------------------- +subroutine mpi_ssend(a, b, c, d, e, f, g) + call die("mpi_ssend(): MPI must not be called in single PE version") +end + +!------------------------------------------------------------------------------- +subroutine mpi_recv(a, b, c, d, e, f, g, h) + call die("mpi_recv(): MPI must not be called in single PE version") +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/pes_mpi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/pes_mpi.F90 new file mode 100644 index 0000000000000000000000000000000000000000..88576ead59b4565ca3aa1703df73539b4617d109 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/pes_mpi.F90 @@ -0,0 +1,33 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! pes_mpi.F90 - MPI version of shmem functions +! +!------------------------------------------------------------------------------- +integer function my_pe() + + implicit none + include 'mpif.h' + integer ierror + + call mpi_comm_rank(MPI_COMM_WORLD, my_pe, ierror) +end + +!------------------------------------------------------------------------------- +integer function num_pes() + + implicit none + include 'mpif.h' + integer ierror + + call mpi_comm_size(MPI_COMM_WORLD, num_pes, ierror) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/pes_single_pe.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/pes_single_pe.F90 new file mode 100644 index 0000000000000000000000000000000000000000..ad919b808f4465cd4244abbbebf8c9987ae48195 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/pes_single_pe.F90 @@ -0,0 +1,28 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! pes_single_pe.F90 - dummy routines for shmem functions +! +!------------------------------------------------------------------------------- +integer function my_pe() + my_pe = 0 +end + +!------------------------------------------------------------------------------- +integer function num_pes() + num_pes = 1 +end + +!------------------------------------------------------------------------------- +subroutine barrier() + return +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_mpi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_mpi.F90 new file mode 100644 index 0000000000000000000000000000000000000000..3cd8fe7adb0870c3854ae66ca2d43fd1d2884dca --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_mpi.F90 @@ -0,0 +1,74 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! reduction_mpi.F90 - reduction operations in MPI +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +function global_sum(local_sum) + + implicit none + include 'mpif.h' + REAL global_sum, local_sum + integer ierror + + TIMING_START(timing_bin_global_sum) + + call mpi_allreduce(local_sum, global_sum, 1, & + BQCD_REAL, MPI_SUM, MPI_COMM_WORLD, ierror) + + TIMING_STOP(timing_bin_global_sum) +end + +!------------------------------------------------------------------------------- +function global_min(local_min) + + implicit none + include 'mpif.h' + real global_min, local_min + integer ierror + + call mpi_allreduce(local_min, global_min, 1, & + MPI_REAL, MPI_MIN, MPI_COMM_WORLD, ierror) +end + +!------------------------------------------------------------------------------- +function global_max(local_max) + + implicit none + include 'mpif.h' + real global_max, local_max + integer ierror + + call mpi_allreduce(local_max, global_max, 1, & + MPI_REAL, MPI_MAX, MPI_COMM_WORLD, ierror) +end + +!------------------------------------------------------------------------------- +subroutine global_sum_vec(n, sum) + + implicit none + include 'mpif.h' + integer, intent(in) :: n + REAL, intent(inout) :: sum(n) + REAL :: tmp(n) + integer ierror + + TIMING_START(timing_bin_global_sum_vec) + + tmp = sum + call mpi_allreduce(tmp, sum, n, BQCD_REAL, MPI_SUM, MPI_COMM_WORLD, ierror) + + TIMING_STOP(timing_bin_global_sum_vec) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_shmem.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_shmem.F90 new file mode 100644 index 0000000000000000000000000000000000000000..6fb261cd1ddcc41a6a3616a437b6b19d0cba2227 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_shmem.F90 @@ -0,0 +1,118 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! reduction_shmem.F90 - reduction operations in shmem +! +!------------------------------------------------------------------------------- +# include "defs.h" +# include "shmem.h" + +!------------------------------------------------------------------------------- +function global_sum(local_sum) + + implicit none + include 'mpp/shmem.fh' + + REAL :: global_sum, local_sum + REAL, save :: source, target + integer :: n_pes + integer, external :: num_pes + REAL, save :: pWrk(2 + shmem_reduce_min_wrkdata_size) + integer, save :: pSync(shmem_reduce_sync_size) + + + TIMING_START(timing_bin_global_sum) + + n_pes = num_pes() + + if (n_pes == 1) then + global_sum = local_sum + return + endif + + source = local_sum + + call shmem_real8_sum_to_all(target, source, 1, 0, 0, n_pes, pWrk, pSync) + + global_sum = target + + TIMING_STOP(timing_bin_global_sum) +end + +!------------------------------------------------------------------------------- +function global_min(local_min) + + implicit none + include 'mpp/shmem.fh' + + real :: global_min, local_min + real, save :: source, target + integer :: n_pes + integer, external :: num_pes + real, save :: pWrk(2 + shmem_reduce_min_wrkdata_size) + integer, save :: pSync(shmem_reduce_sync_size) + + n_pes = num_pes() + + if (n_pes == 1) then + global_min = local_min + return + endif + + source = local_min + + call shmem_real8_min_to_all(target, source, 1, 0, 0, n_pes, pWrk, pSync) + + global_min = target +end + +!------------------------------------------------------------------------------- +function global_max(local_max) + + implicit none + include 'mpp/shmem.fh' + + real :: global_max, local_max + real, save :: source, target + integer :: n_pes + integer, external :: num_pes + real, save :: pWrk(2 + shmem_reduce_min_wrkdata_size) + integer, save :: pSync(shmem_reduce_sync_size) + + n_pes = num_pes() + + if (n_pes == 1) then + global_max = local_max + return + endif + + source = local_max + + call shmem_real8_max_to_all(target, source, 1, 0, 0, n_pes, pWrk, pSync) + + global_max = target +end + +!------------------------------------------------------------------------------- +subroutine global_sum_vec(n, sum) + + implicit none + + integer, intent(in) :: n + REAL, intent(inout) :: sum(n) + + TIMING_START(timing_bin_global_sum_vec) + + call die("global_sum_vec(): shmem version not implemented yet.") + + TIMING_STOP(timing_bin_global_sum_vec) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_single_pe.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_single_pe.F90 new file mode 100644 index 0000000000000000000000000000000000000000..67bf6f99beb81aa5615da8aaf705be1931fd099d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_single_pe.F90 @@ -0,0 +1,52 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! reduction_single_pe.F90 - reduction operations on a single processor +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +function global_sum(local_sum) + + implicit none + REAL :: global_sum, local_sum + + global_sum = local_sum +end + +!------------------------------------------------------------------------------- +function global_min(local_min) + + implicit none + real :: global_min, local_min + + global_min = local_min +end + +!------------------------------------------------------------------------------- +function global_max(local_max) + + implicit none + real :: global_max, local_max + + global_max = local_max +end + +!------------------------------------------------------------------------------- +subroutine global_sum_vec(n, sum) + + implicit none + integer, intent(in) :: n + REAL, intent(inout) :: sum(n) + + return +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_mpi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_mpi.F90 new file mode 100644 index 0000000000000000000000000000000000000000..762a65cb26987313de6193127f10f4c021f619be --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_mpi.F90 @@ -0,0 +1,48 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! seed_mpi.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine seed_broadcast(seed) + + use module_function_decl + implicit none + include 'mpif.h' + SEED seed + integer ierror + + call mpi_bcast(seed, 1, BQCD_SEED, 0, MPI_COMM_WORLD, ierror) +end + +!------------------------------------------------------------------------------- +subroutine seed_compare(seed) + + use module_function_decl + implicit none + include 'mpif.h' + SEED seed, s + integer pe, status(MPI_STATUS_SIZE), ierror + + if (my_pe() /= 0) then + call mpi_ssend(seed, 1, BQCD_SEED, 0, 0, MPI_COMM_WORLD, ierror) + else + do pe = 1, num_pes() - 1 + call mpi_recv(s, 1, BQCD_SEED, pe, 0, MPI_COMM_WORLD, status, ierror) + if (s /= seed) call die('rancheck(): seeds differ') + enddo + endif + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_shmem.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_shmem.F90 new file mode 100644 index 0000000000000000000000000000000000000000..8d76b5cf36f131a7e1c7c1370fba0306471fe53b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_shmem.F90 @@ -0,0 +1,61 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! seed_shmem.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" +# include "shmem.h" + +!------------------------------------------------------------------------------- +subroutine seed_broadcast(seed) + + use module_function_decl + implicit none + include "mpp/shmem.fh" + + SEED :: seed + SEED, save :: s + integer, save :: psync(SHMEM_BCAST_SYNC_SIZE) + + psync = SHMEM_SYNC_VALUE + s = seed + + call barrier() + call shmem_broadcast(s, s, 1, 0, 0, 0, num_pes(), psync) + call barrier() + + seed = s +end + +!------------------------------------------------------------------------------- +subroutine seed_compare(seed) + + use module_function_decl + implicit none + + SEED :: seed + SEED, save :: s + integer :: pe + + s = seed + call barrier() + + if (my_pe() == 0) then + do pe = 1, num_pes() - 1 + call shmem_get(s, s, 1, pe) + if (s /= seed) call die('rancheck(): seeds differ') + enddo + endif + + call barrier() +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_single_pe.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_single_pe.F90 new file mode 100644 index 0000000000000000000000000000000000000000..188f7e7af78b1d2fb0c776784216f8459837456a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_single_pe.F90 @@ -0,0 +1,34 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! seed_single_pe.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine seed_broadcast(seed) + + implicit none + SEED seed + + return +end + +!------------------------------------------------------------------------------- +subroutine seed_compare(seed) + + implicit none + SEED seed + + return +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/shmem.h b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/shmem.h new file mode 100644 index 0000000000000000000000000000000000000000..f6289299e7c9bd847cbb0e3f48eb7f8083dcedbd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/shmem.h @@ -0,0 +1,7 @@ +#ifdef ALTIX +# define barrier shmem_barrier_all +# define shmem_broadcast shmem_broadcast8 +# define shmem_get shmem_get8 +# define shmem_put shmem_put8 +# define shpalloc(addr, length, errcode, abort) shpalloc(addr, 2 * (length), errcode, abort) +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_mpi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_mpi.F90 new file mode 100644 index 0000000000000000000000000000000000000000..51ca5e2eb0e696aa5e62fd8aceae61334a2de5f4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_mpi.F90 @@ -0,0 +1,816 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! xbound_mpi.F90 - boundary exchange with MPI +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_xbound + + implicit none + integer, parameter :: max_bound = 3 * 3 * 3 * 3 + + type type_xbound + integer :: i_source + integer :: i_target + integer :: pe_source + integer :: pe_target + integer :: size ! total size + integer :: vector_type + integer :: block_count + integer :: block_size + integer :: block_stride + end type type_xbound +end + +!------------------------------------------------------------------------------- +module module_xbound_g + + use module_xbound + implicit none + integer, save :: n_bound = 0 + type (type_xbound), save :: b(max_bound) +end + +!------------------------------------------------------------------------------- +module module_xbound_sc + + use module_xbound + implicit none + integer, save :: n_bound = 0 + integer, save :: i_bound(DIM, FWD:BWD) + type (type_xbound), save :: b(max_bound) +end + +!------------------------------------------------------------------------------- +module module_xbound_sc2 + + use module_xbound + implicit none + integer, save :: n_bound = 0 + integer, save :: i_bound(DIM, FWD:BWD) + type (type_xbound), save :: b(max_bound) +end + +!------------------------------------------------------------------------------- +subroutine init_xbound() + + implicit none + integer, external :: version_of_d + + call init_xbound_g() + call init_xbound_sc() + call init_xbound_sc2() + if (version_of_d() == 3) call init_xbound_d3() + if (version_of_d() == 31) call init_xbound_d3() + if (version_of_d() == 4) call init_xbound_d4() +end + +!------------------------------------------------------------------------------- +subroutine init_xbound_g() + + use module_xbound_g + implicit none + + integer :: x, y, z, t + + do t = -1, 1 + do z = -1, 1 + do y = -1, 1 + do x = -1, 1 + call init_xch_bound(n_bound, b, NCOL * NCOL * SIZE_COMPLEX, x, y, z, t) + enddo + enddo + enddo + enddo +end + +!------------------------------------------------------------------------------- +subroutine init_xbound_sc() + + use module_xbound_sc + use module_lattice + implicit none + + integer :: mu, block + + + block = NDIRAC * NCOL * SIZE_COMPLEX + + call init_xch_bound(n_bound, b, block, -1,0,0,0); i_bound(1, BWD) = n_bound + call init_xch_bound(n_bound, b, block, +1,0,0,0); i_bound(1, FWD) = n_bound + call init_xch_bound(n_bound, b, block, 0,-1,0,0); i_bound(2, BWD) = n_bound + call init_xch_bound(n_bound, b, block, 0,+1,0,0); i_bound(2, FWD) = n_bound + call init_xch_bound(n_bound, b, block, 0,0,-1,0); i_bound(3, BWD) = n_bound + call init_xch_bound(n_bound, b, block, 0,0,+1,0); i_bound(3, FWD) = n_bound + call init_xch_bound(n_bound, b, block, 0,0,0,-1); i_bound(4, BWD) = n_bound + call init_xch_bound(n_bound, b, block, 0,0,0,+1); i_bound(4, FWD) = n_bound + + do mu = 1, DIM + if (npe(mu) == 1) then + i_bound(mu, FWD) = 0 + i_bound(mu, BWD) = 0 + endif + enddo +end + +!------------------------------------------------------------------------------- +subroutine init_xbound_sc2() + + use module_xbound_sc2 + use module_lattice + implicit none + + integer :: mu, block + + + block = 2 * NCOL * SIZE_COMPLEX + + call init_xch_bound(n_bound, b, block, -1,0,0,0); i_bound(1, BWD) = n_bound + call init_xch_bound(n_bound, b, block, +1,0,0,0); i_bound(1, FWD) = n_bound + call init_xch_bound(n_bound, b, block, 0,-1,0,0); i_bound(2, BWD) = n_bound + call init_xch_bound(n_bound, b, block, 0,+1,0,0); i_bound(2, FWD) = n_bound + call init_xch_bound(n_bound, b, block, 0,0,-1,0); i_bound(3, BWD) = n_bound + call init_xch_bound(n_bound, b, block, 0,0,+1,0); i_bound(3, FWD) = n_bound + call init_xch_bound(n_bound, b, block, 0,0,0,-1); i_bound(4, BWD) = n_bound + call init_xch_bound(n_bound, b, block, 0,0,0,+1); i_bound(4, FWD) = n_bound + + do mu = 1, DIM + if (npe(mu) == 1) then + i_bound(mu, FWD) = 0 + i_bound(mu, BWD) = 0 + endif + enddo +end + +!------------------------------------------------------------------------------- +subroutine init_xch_bound(n_bound, b, block_size, xx, yy, zz, tt) + + use module_xbound + use module_function_decl + use module_nnpe + use module_offset + use module_lattice + use module_vol + implicit none + include 'mpif.h' + + integer, intent(inout) :: n_bound + type (type_xbound), intent(inout) :: b(max_bound) + integer, intent(in) :: block_size, xx, yy, zz, tt + + integer, dimension (DIM) :: dir, m, i, target, source + integer, external :: xyzt2i, n_sites, e_o + integer :: x, y, z, t, size, mu, stride, block_count, ierror + integer :: tmp_type1, tmp_type2, the_type + integer(MPI_ADDRESS_KIND):: true_lb, true_extent + integer :: extent + + logical :: special + + + if (nnpe(xx, yy, zz, tt) == my_pe()) return + + if (xx /= 0 .and. yy == 0 .and. zz /= 0 .and. tt == 0) then + special = .true. + else + special = .false. + endif + + + dir = (/ xx, yy, zz, tt /) + + do mu = 1, DIM + if (dir(mu) /= 0) then + m(mu) = 1 + else + m(mu) = NH(mu) + endif + + if (dir(mu) == -1) then + target(mu) = -1 + source(mu) = N(mu) - 1 + elseif (dir(mu) == +1) then + target(mu) = N(mu) + source(mu) = 0 + else + target(mu) = 0 + source(mu) = 0 + endif + enddo + + + size = block_size + do mu = 1, DIM + if (dir(mu) == 0) then + size = size * NH(mu) + m(mu) = 1 + else + exit + endif + enddo + + stride = block_size + do mu = 1, DIM + if (m(mu) == 1) then + stride = stride * NH(mu) + else + exit + endif + enddo + + block_count = 1 + do mu = 1, DIM + block_count = block_count * m(mu) + enddo + + n_bound = n_bound + 1 + ASSERT(n_bound <= max_bound) + + if (special) then ! (y,t)-plane + + ! MPY-type for y-line: + + block_count = NY + size = block_size + stride = block_size * NXH + + call mpi_type_vector(block_count, size, stride, BQCD_REAL, tmp_type1, ierror) + call mpi_type_commit(tmp_type1, ierror) + + +#ifdef ALTIX + ! use MPI-1 + call mpi_type_extent(BQCD_REAL, extent, ierror) + call mpi_type_struct(2, (/1, 1/), (/0, extent/), (/tmp_type1, MPI_UB/), & + tmp_type2, ierror) +#else + ! use MPI-2 + call mpi_type_get_true_extent(BQCD_REAL, true_lb, true_extent, ierror) + call mpi_type_create_resized(tmp_type1, true_lb, true_extent, tmp_type2, ierror) +#endif + call mpi_type_commit(tmp_type2, ierror) + + ! MPI-parameters for (y,t)-plane: + + block_count = NT + size = 1 + stride = block_size * NXH * NY * NZ + the_type = tmp_type2 + b(n_bound)%size = block_size * NY * NT + + else + + the_type = BQCD_REAL + b(n_bound)%size = block_count * size + + endif + + + b(n_bound)%i_source = xyzt2i(source) + b(n_bound)%i_target = xyzt2i(target) + b(n_bound)%pe_source = nnpe(xx, yy, zz, tt) + b(n_bound)%pe_target = nnpe(-xx, -yy, -zz, -tt) + + b(n_bound)%block_count = block_count + b(n_bound)%block_size = size + b(n_bound)%block_stride= stride + + call mpi_type_vector(block_count, size, stride, the_type, & + b(n_bound)%vector_type, ierror) + call mpi_type_commit(b(n_bound)%vector_type, ierror) + + !!if ( my_pe() == 0) write(6,*) xx,yy,zz,tt, block_count, size, stride + !!if ( my_pe() == 0) write(6,*) xx,yy,zz,tt, b(n_bound)%i_source, b(n_bound)%i_target, nnpe(xx,yy,zz,tt), my_pe() + + !!ASSERT(b(n_bound)%size == block_size * n_sites(DIM, dir, NH, NPE)) + + if (special) then + call mpi_type_free(tmp_type1, ierror) + call mpi_type_free(tmp_type2, ierror) + endif +end + +!------------------------------------------------------------------------------- +subroutine xbound_g_field(u) + + use module_function_decl + use module_vol + implicit none + + GAUGE_FIELD :: u + integer :: mu, eo, x, y, z, t + + if (num_pes() == 1) return + + do mu = 1, DIM + do eo = EVEN, ODD + call xbound_g(u, eo, mu) + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine xbound_g(u, eo, mu) + + use module_xbound_g + use module_function_decl + use module_vol + implicit none + include 'mpif.h' + + integer :: eo, mu, i, status(MPI_STATUS_SIZE), ierror + GAUGE_FIELD :: u + + if (num_pes() == 1) return + + do i = 1, n_bound + call mpi_sendrecv( & + u(1,1, b(i)%i_source, eo,mu), 1, b(i)%vector_type, b(i)%pe_target, 0,& + u(1,1, b(i)%i_target, eo,mu), b(i)%size, BQCD_REAL, b(i)%pe_source, 0,& + MPI_COMM_WORLD, status, ierror) + enddo +end + +!------------------------------------------------------------------------------- +subroutine xbound_sc_field(a) + + use module_function_decl + use module_vol + implicit none + include 'mpif.h' + + integer :: i, status(MPI_STATUS_SIZE), ierror + integer :: mu, fb + SPINCOL_FIELD :: a + + if (num_pes() == 1) return + + do mu = 1, DIM + call xbound_sc(a, mu) + enddo +end + +!------------------------------------------------------------------------------- +subroutine xbound_sc(a, direction) + + use module_xbound_sc + use module_function_decl + use module_vol + implicit none + include 'mpif.h' + + integer :: i, status(MPI_STATUS_SIZE), ierror + integer :: direction, mu, fb + SPINCOL_FIELD :: a + + if (num_pes() == 1) return + + mu = direction + do fb = FWD, BWD + i = i_bound(mu, fb) + if (i /= 0) then + call mpi_sendrecv( & + a(1,1, b(i)%i_source), 1, b(i)%vector_type, b(i)%pe_target, 0,& + a(1,1, b(i)%i_target), b(i)%size, BQCD_REAL, b(i)%pe_source, 0,& + MPI_COMM_WORLD, status, ierror) + endif + enddo +end + +!------------------------------------------------------------------------------- +subroutine xbound_sc2_field(a) + + use module_function_decl + use module_vol + implicit none + include 'mpif.h' + + SC2_FIELD :: a + integer :: mu + + if (num_pes() == 1) return + + do mu = 1, DIM + call xbound_sc2(a, mu) + enddo +end + +!------------------------------------------------------------------------------- +subroutine xbound_sc2_field_i(a) ! "i"mmediate MPI calls + + use module_xbound_sc2 + use module_function_decl + use module_vol + implicit none + include 'mpif.h' + + SC2_FIELD :: a + + integer, parameter :: max_request = 2 * 2 * DIM + + integer :: request(max_request), status(MPI_STATUS_SIZE, max_request), ierror + integer :: mu, fb, i, n_request + + if (num_pes() == 1) return + + n_request = 0 + + do mu = 1, DIM + do fb = FWD, BWD + i = i_bound(mu, fb) + if (i /= 0) then + n_request = n_request + 1 + call mpi_irecv( & + a(1,1, b(i)%i_target, mu,fb), b(i)%size, BQCD_REAL, b(i)%pe_source, 0,& + MPI_COMM_WORLD, request(n_request), ierror) + endif + enddo + enddo + + do mu = 1, DIM + do fb = FWD, BWD + i = i_bound(mu, fb) + if (i /= 0) then + n_request = n_request + 1 + call mpi_isend( & + a(1,1, b(i)%i_source, mu,fb), 1, b(i)%vector_type, b(i)%pe_target, 0,& + MPI_COMM_WORLD, request(n_request), ierror) + endif + enddo + enddo + + call mpi_waitall(n_request, request, status, ierror) +end + +!------------------------------------------------------------------------------- +subroutine xbound_sc2(a, direction) + + use module_xbound_sc2 + use module_function_decl + use module_vol + implicit none + include 'mpif.h' + + integer :: i, status(MPI_STATUS_SIZE), ierror + integer :: direction, mu, fb + SC2_FIELD :: a + + if (num_pes() == 1) return + + mu = direction + do fb = FWD, BWD + i = i_bound(mu, fb) + if (i /= 0) then + call mpi_sendrecv( & + a(1,1, b(i)%i_source, mu,fb), 1, b(i)%vector_type, b(i)%pe_target, 0,& + a(1,1, b(i)%i_target, mu,fb), b(i)%size, BQCD_REAL, b(i)%pe_source, 0,& + MPI_COMM_WORLD, status, ierror) + endif + enddo +end + +!=============================================================================== +! +! new stuff for libd3 +! +!------------------------------------------------------------------------------- +module module_xbound_d3 + + use module_xbound + implicit none + integer, save :: n_bound(DIM) + type (type_xbound), save :: b(2, DIM) + + integer, save :: xch_yf = 0 + integer, save :: xch_yb = 0 + integer, save :: xch_zf = 0 + integer, save :: xch_zb = 0 + integer, save :: xch_tf = 0 + integer, save :: xch_tb = 0 +#ifdef D3_BUFFER_VOL + integer, parameter :: d3_buffer_vol = D3_BUFFER_VOL +#else + integer, parameter :: d3_buffer_vol = 0 +#endif + + type (type_xbound), save :: byf, byb, bzf, bzb, btf, btb + + ! allocate buffer for MPI in static memory to speed-up communication on SR8000 + integer, parameter :: max_buffer = NDIRAC*NCOL*SIZE_COMPLEX*d3_buffer_vol + + REAL, dimension (max_buffer), save :: buffer_yf, buffer_yb, & + buffer_zf, buffer_zb +end + +!------------------------------------------------------------------------------- +integer function get_d3_buffer_vol() + + use module_xbound_d3 + implicit none + + get_d3_buffer_vol = d3_buffer_vol +end + +!------------------------------------------------------------------------------- +subroutine init_xbound_d3() + + use module_lattice + use module_xbound_d3 + implicit none + + if (npe(1) /= 1) call die("init_xbound_d3(): npe(1) /= 1") + + n_bound = 0 + + call init_xch_bound(n_bound(2), b(1,2), NDIRAC*NCOL*SIZE_COMPLEX, 0, 1,0,0) + call init_xch_bound(n_bound(2), b(1,2), NDIRAC*NCOL*SIZE_COMPLEX, 0,-1,0,0) + + call init_xch_bound(n_bound(3), b(1,3), NDIRAC*NCOL*SIZE_COMPLEX, 0,0, 1,0) + call init_xch_bound(n_bound(3), b(1,3), NDIRAC*NCOL*SIZE_COMPLEX, 0,0,-1,0) + + call init_xch_bound(n_bound(4), b(1,4), NDIRAC*NCOL*SIZE_COMPLEX, 0,0,0, 1) + call init_xch_bound(n_bound(4), b(1,4), NDIRAC*NCOL*SIZE_COMPLEX, 0,0,0,-1) + + call init_xch_bound(xch_yf, byf, NDIRAC*NCOL*SIZE_COMPLEX, 0, 1,0,0) + call init_xch_bound(xch_yb, byb, NDIRAC*NCOL*SIZE_COMPLEX, 0,-1,0,0) + + call init_xch_bound(xch_zf, bzf, NDIRAC*NCOL*SIZE_COMPLEX, 0,0, 1,0) + call init_xch_bound(xch_zb, bzb, NDIRAC*NCOL*SIZE_COMPLEX, 0,0,-1,0) + + call init_xch_bound(xch_tf, btf, NDIRAC*NCOL*SIZE_COMPLEX, 0,0,0, 1) + call init_xch_bound(xch_tb, btb, NDIRAC*NCOL*SIZE_COMPLEX, 0,0,0,-1) + + !!if (xch_yf /= 0) allocate(buffer_yf(byf%size)) + !!if (xch_yb /= 0) allocate(buffer_yb(byb%size)) + !!if (xch_zf /= 0) allocate(buffer_zf(bzf%size)) + !!if (xch_zb /= 0) allocate(buffer_zb(bzb%size)) + + if (byf%size > max_buffer) call die("init_xbound_d3(): byf%size") + if (byb%size > max_buffer) call die("init_xbound_d3(): byb%size") + if (bzf%size > max_buffer) call die("init_xbound_d3(): bzf%size") + if (bzb%size > max_buffer) call die("init_xbound_d3(): bzb%size") + + if (xch_yf /= xch_yb) call die("init_xbound_d3(): xch_y") + if (xch_zf /= xch_zb) call die("init_xbound_d3(): xch_z") + if (byf%block_size /= byb%block_size) call die("init_xbound_d3(): size") + if (byf%block_count /= byb%block_count) call die("init_xbound_d3(): count") + if (byf%block_stride /= byb%block_stride) call die("init_xbound_d3(): stride") +end + +!------------------------------------------------------------------------------- +subroutine xbound_d3(a, direction) + + use module_xbound_d3 + use module_function_decl + use module_vol + implicit none + include 'mpif.h' + + integer :: i, status(MPI_STATUS_SIZE), ierror, direction, d + SPINCOL_FIELD :: a + + if (num_pes() == 1) return + + d = direction + + do i = 1, n_bound(d) + call mpi_sendrecv( & + a(1, 1, b(i,d)%i_source), 1, b(i,d)%vector_type, b(i,d)%pe_target,0,& + a(1, 1, b(i,d)%i_target), b(i,d)%size, BQCD_REAL, b(i,d)%pe_source,0,& + MPI_COMM_WORLD, status, ierror) + enddo +end + +!------------------------------------------------------------------------------- +subroutine xbound_yf(a) + + use module_xbound_d3 + use module_function_decl + use module_vol + implicit none + include 'mpif.h' + + integer :: i, status(MPI_STATUS_SIZE), ierror + SPINCOL_FIELD :: a + + if (num_pes() == 1 .or. xch_yf == 0) return + + call mpi_sendrecv( & + a(1, 1, byf%i_source), 1, byf%vector_type, byf%pe_target,1,& + a(1, 1, byf%i_target), byf%size, BQCD_REAL, byf%pe_source,1,& + MPI_COMM_WORLD, status, ierror) +end + +!------------------------------------------------------------------------------- +subroutine xbound_yb(a) + + use module_xbound_d3 + use module_function_decl + use module_vol + implicit none + include 'mpif.h' + + integer :: i, status(MPI_STATUS_SIZE), ierror + SPINCOL_FIELD :: a + + if (num_pes() == 1 .or. xch_yb == 0) return + + call mpi_sendrecv( & + a(1, 1, byb%i_source), 1, byb%vector_type, byb%pe_target,2,& + a(1, 1, byb%i_target), byb%size, BQCD_REAL, byb%pe_source,2,& + MPI_COMM_WORLD, status, ierror) +end + +!------------------------------------------------------------------------------- +subroutine xbound_fill_buffer_y(a) + + use module_xbound_d3 + use module_function_decl + use module_vol + implicit none + + REAL :: a(*) + integer :: i, j, off_af, off_ab, off_b + integer :: count, size, stride, start_af, start_ab + + if (num_pes() == 1 .or. xch_yf == 0) return + + !$omp parallel private(i, j, off_af, off_ab, off_b, start_af, start_ab, & + !$omp count, size, stride) + start_af = (byf%i_source - 1) * NDIRAC * NCOL * SIZE_COMPLEX + 1 + start_ab = (byb%i_source - 1) * NDIRAC * NCOL * SIZE_COMPLEX + 1 + + count = byf%block_count + size = byf%block_size + stride = byf%block_stride + + !$omp do + do i = 0, count - 1 + off_af = start_af + i * stride + off_ab = start_ab + i * stride + off_b = i * size + 1 + do j = 0, size - 1 + buffer_yf(off_b + j) = a(off_af + j) + buffer_yb(off_b + j) = a(off_ab + j) + enddo + enddo + !$omp end parallel +end + +!------------------------------------------------------------------------------- +subroutine xbound_copy_buffer_y(a) + + use module_xbound_d3 + use module_function_decl + use module_vol + implicit none + include 'mpif.h' + + integer :: status(MPI_STATUS_SIZE), ierror + SPINCOL_FIELD :: a + + if (num_pes() == 1 .or. xch_yf == 0) return + + call mpi_sendrecv( & + buffer_yf(1), byf%size, BQCD_REAL, byf%pe_target, 0,& + a(1, 1, byf%i_target), byf%size, BQCD_REAL, byf%pe_source, 0,& + MPI_COMM_WORLD, status, ierror) + + call mpi_sendrecv( & + buffer_yb(1), byb%size, BQCD_REAL, byb%pe_target, 0,& + a(1, 1, byb%i_target), byb%size, BQCD_REAL, byb%pe_source, 0,& + MPI_COMM_WORLD, status, ierror) + +end + +!------------------------------------------------------------------------------- +subroutine xbound_fill_buffer_z(a) + + use module_xbound_d3 + use module_function_decl + use module_vol + implicit none + + REAL :: a(*) + integer :: i, j, off_af, off_ab, off_b + integer :: count, size, stride, start_af, start_ab + + if (num_pes() == 1 .or. xch_zf == 0) return + + !$omp parallel private(i, j, off_af, off_ab, off_b, start_af, start_ab, & + !$omp count, size, stride) + start_af = (bzf%i_source - 1) * NDIRAC * NCOL * SIZE_COMPLEX + 1 + start_ab = (bzb%i_source - 1) * NDIRAC * NCOL * SIZE_COMPLEX + 1 + + count = bzf%block_count + size = bzf%block_size + stride = bzf%block_stride + + !$omp do + do i = 0, count - 1 + off_af = start_af + i * stride + off_ab = start_ab + i * stride + off_b = i * size + 1 + do j = 0, size - 1 + buffer_zf(off_b + j) = a(off_af + j) + buffer_zb(off_b + j) = a(off_ab + j) + enddo + enddo + !$omp end parallel +end + +!------------------------------------------------------------------------------- +subroutine xbound_copy_buffer_z(a) + + use module_xbound_d3 + use module_function_decl + use module_vol + implicit none + include 'mpif.h' + + integer :: status(MPI_STATUS_SIZE), ierror + SPINCOL_FIELD :: a + + if (num_pes() == 1 .or. xch_zf == 0) return + + call mpi_sendrecv( & + buffer_zf(1), bzf%size, BQCD_REAL, bzf%pe_target, 0,& + a(1, 1, bzf%i_target), bzf%size, BQCD_REAL, bzf%pe_source, 0,& + MPI_COMM_WORLD, status, ierror) + + call mpi_sendrecv( & + buffer_zb(1), bzb%size, BQCD_REAL, bzb%pe_target, 0,& + a(1, 1, bzb%i_target), bzb%size, BQCD_REAL, bzb%pe_source, 0,& + MPI_COMM_WORLD, status, ierror) + +end + +!=============================================================================== +! +! stuff for lib_d4 +! +!------------------------------------------------------------------------------- +subroutine init_xbound_d4() + + use module_lattice + implicit none + + if (npe(1) /= 1) call die("init_xbound_d4(): npe(1) /= 1") + if (npe(2) /= 1) call die("init_xbound_d4(): npe(2) /= 1") + if (npe(3) /= 1) call die("init_xbound_d4(): npe(3) /= 1") + + call init_xbound_d3() +end + +!------------------------------------------------------------------------------- +subroutine xbound_tf(a) + + use module_xbound_d3 + use module_function_decl + use module_vol + implicit none + include 'mpif.h' + + integer :: i, status(MPI_STATUS_SIZE), ierror + SPINCOL_FIELD :: a + + if (num_pes() == 1 .or. xch_tf == 0) return + + call mpi_sendrecv( & + a(1, 1, btf%i_source), 1, btf%vector_type, btf%pe_target,1,& + a(1, 1, btf%i_target), btf%size, BQCD_REAL, btf%pe_source,1,& + MPI_COMM_WORLD, status, ierror) +end + +!------------------------------------------------------------------------------- +subroutine xbound_tb(a) + + use module_xbound_d3 + use module_function_decl + use module_vol + implicit none + include 'mpif.h' + + integer :: i, status(MPI_STATUS_SIZE), ierror + SPINCOL_FIELD :: a + + if (num_pes() == 1 .or. xch_tb == 0) return + + call mpi_sendrecv( & + a(1, 1, btb%i_source), 1, btb%vector_type, btb%pe_target,2,& + a(1, 1, btb%i_target), btb%size, BQCD_REAL, btb%pe_source,2,& + MPI_COMM_WORLD, status, ierror) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_shmem.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_shmem.F90 new file mode 100644 index 0000000000000000000000000000000000000000..3cff78a725a096e8d19e47bd531e6789c347f6bd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_shmem.F90 @@ -0,0 +1,208 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! xbound_shmem.F90 - boundary exchange with shmem +! +!------------------------------------------------------------------------------- +# include "defs.h" +# include "shmem.h" + +!------------------------------------------------------------------------------- +subroutine init_xbound() + + return +end + +!------------------------------------------------------------------------------- +subroutine xbound_g(u, eo, mu) + + use module_function_decl + use module_vol + implicit none + + integer :: eo, mu, x, y, z, t + GAUGE_FIELD :: u + + if (num_pes() == 1) return + + call barrier() + + do t = -1, 1 + do z = -1, 1 + do y = -1, 1 + do x = -1, 1 + call xch_bound(NCOL * NCOL * SIZE_COMPLEX, u(1, 1, 1, eo, mu), x, y, z, t) + call barrier() + enddo + enddo + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine xbound_g_field(u) + + use module_function_decl + use module_vol + implicit none + + GAUGE_FIELD :: u + integer :: mu, eo, x, y, z, t + + if (num_pes() == 1) return + + call barrier() + + do mu = 1, DIM + do eo = EVEN, ODD + call xbound_g(u, eo, mu) + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine xbound_sc_field(array) + + use module_function_decl + use module_vol + implicit none + + SPINCOL_FIELD :: array + integer :: x, y, z, t + + if (num_pes() == 1) return + + call barrier() + + do t = -1, 1 + do z = -1, 1 + do y = -1, 1 + do x = -1, 1 + if ((abs(x) + abs(y) + abs(z) + abs(t)) == 1) then + call xch_bound(NDIRAC * NCOL * SIZE_COMPLEX, array, x, y, z, t) + call barrier() + endif + enddo + enddo + enddo + enddo +end + +!------------------------------------------------------------------------------- +subroutine xbound_sc2_field_i(a) + + use module_function_decl + use module_lattice + use module_vol + implicit none + + SC2_FIELD :: a + integer :: x, y, z, t + + if (num_pes() == 1) return + + call barrier() + + call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 1, FWD), +1, 0, 0, 0) + call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 1, BWD), -1, 0, 0, 0) + call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 2, FWD), 0, +1, 0, 0) + call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 2, BWD), 0, -1, 0, 0) + call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 3, FWD), 0, 0, +1, 0) + call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 3, BWD), 0, 0, -1, 0) + call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 4, FWD), 0, 0, 0, +1) + call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 4, BWD), 0, 0, 0, -1) + + call barrier() + +end + +!------------------------------------------------------------------------------- +subroutine xch_bound(mm, array, xx, yy, zz, tt) + + use module_function_decl + use module_nnpe + use module_offset + use module_lattice + use module_vol + implicit none + include 'mpif.h' + + integer :: mm + REAL, dimension (mm, volh_tot) :: array + integer, dimension (DIM) :: dir, m, i, target, source + integer, external :: xyzt2i + integer :: xx, yy, zz, tt, x, y, z, t, pe, size, mu + + pe = nnpe(xx, yy, zz, tt) + if (pe == my_pe()) return + + dir = (/ xx, yy, zz, tt /) + + do mu = 1, DIM + if (dir(mu) /= 0) then + m(mu) = 1 + else + m(mu) = NH(mu) + endif + enddo + + size = mm + if (dir(1) == 0) then + size = size * NXH + m(1) = 1 + if (dir(2) == 0) then + size = size * N(2) + m(2) = 1 + if (dir(3) == 0) then + size = size * N(3) + m(3) = 1 + if (dir(4) == 0) then + size = size * N(4) + m(4) = 1 + endif + endif + endif + endif + + do t = 0, m(4) - 1 + do z = 0, m(3) - 1 + do y = 0, m(2) - 1 + do x = 0, m(1) - 1 + + i = (/ x, y, z, t /) + + do mu = 1, DIM + if (dir(mu) == -1) then + target(mu) = -1 + source(mu) = N(mu) - 1 + elseif (dir(mu) == +1) then + target(mu) = N(mu) + source(mu) = 0 + else + target(mu) = i(mu) + source(mu) = i(mu) + endif + enddo + +!!! call shmem_get(array(1, xyzt2i(target)), & +!!! array(1, xyzt2i(source)), size, pe) + call shmem_put(array(1, xyzt2i(target)), & + array(1, xyzt2i(source)), size, nnpe(-xx,-yy,-zz,-tt)) + + enddo + enddo + enddo + enddo + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_single_pe.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_single_pe.F90 new file mode 100644 index 0000000000000000000000000000000000000000..5fab9e1eeb57377b0ef2d1ecbd9de848a5a7fc3d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_single_pe.F90 @@ -0,0 +1,72 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! xbound_single_pe.F90 - dummy routines for boundary exchange +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine init_xbound() + return +end + +!------------------------------------------------------------------------------- +subroutine xbound_g(u, eo, mu) + + use module_vol + implicit none + integer :: eo, mu + GAUGE_FIELD :: u + + return +end + +!------------------------------------------------------------------------------- +subroutine xbound_g_field(u) + + use module_vol + implicit none + GAUGE_FIELD :: u + + return +end + +!------------------------------------------------------------------------------- +subroutine xbound_sc_field(array) + + use module_vol + implicit none + SPINCOL_FIELD :: array + + return +end + +!------------------------------------------------------------------------------- +subroutine xbound_sc2_field(array) + + use module_vol + implicit none + SC2_FIELD :: array + + return +end + +!------------------------------------------------------------------------------- +subroutine xbound_sc2_field_i(array) + + use module_vol + implicit none + SC2_FIELD :: array + + return +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_test.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_test.F90 new file mode 100644 index 0000000000000000000000000000000000000000..f76c76cbc1fb12d5daa003f5e98b97ac16425a47 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_test.F90 @@ -0,0 +1,177 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! xbound_test.F90 - test of xbound_g() +! all possible dimensions must be decomposed +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine xbound_test() + + use module_function_decl + use module_lattice + use module_vol + implicit none + GAUGE_FIELD :: u, v + integer :: x, y, z, t, j(4), i, eo, global(4) + integer :: is_bound_x, is_bound_y, is_bound_z, is_bound_t + integer :: dx, dy, dz, dt + integer, external :: xyzt2i, e_o + character(16) :: status + +!! call conf_zero(u) +!! +!! do i = 1, volh +!! u(1, 1, i, EVEN, 1) = 123 +!! u(1, 1, i, ODD, 1) = 789 +!! enddo +!! +!! call xbound_g(u, EVEN, 1) +!! call xbound_g(u, ODD, 1) +!! +!! do i = 1, volh_tot +!! ASSERT(u(1, 1, i, EVEN, 1) == 123) +!! ASSERT(u(1, 1, i, ODD, 1) == 789) +!! enddo +!! +!!!----------------------------------------------- + call conf_zero(u) + + u = cmplx(12345.0, 67890.0) + + call open_diag() + + do t = 0, NT - 1 + do z = 0, NZ - 1 + do y = 0, NY - 1 + do x = 0, NX - 1 + j = (/x, y, z, t/) + + i = xyzt2i(j) + eo = e_o(j) + + call local2global(my_pe(), j, global) + + u(1, 1, i, eo, 1) = global(1) + u(2, 2, i, eo, 1) = global(2) + u(3, 3, i, eo, 1) = global(3) + u(1, 2, i, eo, 1) = global(4) + + !!write(UDIAG, "(4i6, 2i8)") j, i, eo + + !!write(UDIAG, "(10i6)") j, global, i, eo + enddo + enddo + enddo + enddo + + call xbound_g(u, EVEN, 1) + call xbound_g(u, ODD, 1) + + write(UDIAG,*) + write(UDIAG,*) + + !!ASSERT(e_o((/0,0,0,0/)) == 0) + + + do i = 1, volh_tot + do eo = EVEN, ODD + write(UDIAG, "(a,2i6,4f8.1)") "alles: ", i, eo, & + real(u(1, 1, i, eo, 1)), & + real(u(2, 2, i, eo, 1)), & + real(u(3, 3, i, eo, 1)), & + real(u(1, 2, i, eo, 1)) + enddo + enddo + + write(UDIAG,*) + write(UDIAG,*) + + do t = -1, NT + is_bound_t = 0 + if (t == -1) is_bound_t = 1 + if (t == NT) is_bound_t = 1 + do z = -1, NZ + is_bound_z = 0 + if (z == -1) is_bound_z = 1 + if (z == NZ) is_bound_z = 1 + do y = -1, NY + is_bound_y = 0 + if (y == -1) is_bound_y = 1 + if (y == NY) is_bound_y = 1 + do x = -1, NX + is_bound_x = 0 + if (x == -1) is_bound_x = 1 + if (x == NX) is_bound_x = 1 + + if (is_bound_x + is_bound_y + is_bound_z + is_bound_t <= 2) then + + +!! do x = NX, NX + + j = (/x, y, z, t/) + j = (/x, y, z, t/) + + i = xyzt2i(j) + eo = e_o(j) + + call local2global(my_pe(), j, global) + + dx = -is_bound_x; if (x == NX) dx = 1 + dy = -is_bound_y; if (y == NY) dy = 1 + dz = -is_bound_z; if (z == NZ) dz = 1 + dt = -is_bound_t; if (t == NT) dt = 1 + + if (u(1, 1, i, eo, 1) == global(1) .and. & + u(2, 2, i, eo, 1) == global(2) .and. & + u(3, 3, i, eo, 1) == global(3) .and. & + u(1, 2, i, eo, 1) == global(4)) then + status = " okay" + !!elseif (u(1, 1, i, eo, 1) == global(1) + dx .and. & + !! u(2, 2, i, eo, 1) == global(2) + dy .and. & + !! u(3, 3, i, eo, 1) == global(3) + dz .and. & + !! u(1, 2, i, eo, 1) == global(4) + dt) then + !! status = " okay2" + else + status = "" + + dx = int(u(1, 1, i, eo, 1)) - global(1) + dy = int(u(2, 2, i, eo, 1)) - global(2) + dz = int(u(3, 3, i, eo, 1)) - global(3) + dt = int(u(1, 2, i, eo, 1)) - global(4) + + write(status,"(a,4i3,a)") " (", dx, dy, dz, dt, ")" + endif + + + !!if (eo == 0) write(UDIAG, "(4i6, 2x, 4i6, i8, 2i3)") j, i, eo + !write(UDIAG, "(10i6)") j, global, i, eo + !!ASSERT(eo == mod(4+x+y+z+t, 2)) + + write(UDIAG, "(10i6,4f8.1,a)") j, global, i, eo, & + real(u(1, 1, i, eo, 1)), & + real(u(2, 2, i, eo, 1)), & + real(u(3, 3, i, eo, 1)), & + real(u(1, 2, i, eo, 1)), status + + ASSERT(u(1, 1, i, eo, 1) == global(1)) + ASSERT(u(2, 2, i, eo, 1) == global(2)) + ASSERT(u(3, 3, i, eo, 1) == global(3)) + ASSERT(u(1, 2, i, eo, 1) == global(4)) + + endif + enddo + enddo + enddo + enddo + +end diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/conf.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/conf.F90 new file mode 100644 index 0000000000000000000000000000000000000000..94f937766b843e369c832555630c8e04fe1cb717 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/conf.F90 @@ -0,0 +1,461 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! conf.F90 - operations on gauge field and pseudo fermion field configurations +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine init_confs(para, conf) + + use typedef_para + use module_p_interface + use module_switches + implicit none + + type(type_para) :: para + type(hmc_conf), dimension(MAX_TEMPER) :: conf + integer :: i + + do i = 1, para%n_temper + call allocate_g_field(conf(i)%u) + call allocate_sc_field(conf(i)%phi) + if (switches%hasenbusch) call allocate_sc_field(conf(i)%phi2) + if (para%start == START_HOT .or. para%start == START_COLD) then + call init_u(conf(i)%u, para%start) + conf(i)%former = i + endif + enddo + + if (para%start == START_CONT) call conf_read(.true., para, conf) + if (para%start == START_FILE) call conf_read(.false., para, conf) + + do i = 1, para%n_temper + if (para%hmc(i)%csw_kappa /= ZERO) then + call allocate_clover_field_a(conf(i)%a) + call allocate_clover_field_a(conf(i)%i) + call allocate_clover_field_b(conf(i)%b) + call clover_init(conf(i)%a, conf(i)%i, conf(i)%b, & + conf(i)%u, para%hmc(i)%csw_kappa) + endif + enddo + +end + +!------------------------------------------------------------------------------- +subroutine init_u(u, start) ! initialization of u-field (at trajectory 0) + + use module_vol + implicit none + + GAUGE_FIELD :: u + integer :: start + + select case (start) + case (START_HOT) + call conf_hot(u) + case (START_COLD) + call conf_cold(u) + case default + call die("init_u(): don't know how to start") + end select + + call xbound_g_field(u) +end + +!------------------------------------------------------------------------------- +subroutine conf_check(u) ! checks if u-field is SU(3) + + use module_vol + implicit none + + GAUGE_FIELD, intent(in) :: u + SU3 :: v + SU3, parameter :: su3_one = reshape( & + (/ ONE,ZERO,ZERO, & + ZERO,ONE,ZERO, & + ZERO,ZERO,ONE /), & + (/ NCOL, NCOL /)) + REAL :: dev, d + integer :: i, j, k, eo, mu + + + dev = ZERO + do mu = 1, DIM + do eo = EVEN, ODD + do k = 1, VOLH + call su3_check_det(u(1, 1, k, eo, mu)) + call uud(v, u(1, 1, k, eo, mu), u(1, 1, k, eo, mu)) + do i = 1, NCOL + do j = 1, NCOL + d = abs(Re(v(i, j)) - Re(su3_one(i, j))) & + + abs(Im(v(i, j)) - Im(su3_one(i, j))) + enddo + enddo + dev = max(dev, d) + enddo + enddo + enddo + + if (dev > 1e-13) call die('conf_check(): dev > 1e-13') +!!write(0,'(x,a,e10.2)') 'conf_check(): max deviation is ', dev + +end + +!------------------------------------------------------------------------------- +subroutine conf_normalize(u) ! normalizes u-field to SU(3) + + use module_vol + implicit none + + GAUGE_FIELD, intent(inout) :: u + integer :: i, eo, mu + + do mu = 1, DIM + do eo = EVEN, ODD + do i = 1, volh + call u_normalize(u(1, 1, i, eo, mu)) + enddo + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine conf_zero(u) ! init ("OpenMP first touch") + + use module_vol + implicit none + + GAUGE_FIELD :: u + integer :: i, eo, mu + + do mu = 1, DIM + do eo = EVEN, ODD + !$omp parallel do + do i = 1, volh + u(1, 1, i, eo, mu) = ZERO + u(2, 1, i, eo, mu) = ZERO + u(3, 1, i, eo, mu) = ZERO + u(1, 2, i, eo, mu) = ZERO + u(2, 2, i, eo, mu) = ZERO + u(3, 2, i, eo, mu) = ZERO + u(1, 3, i, eo, mu) = ZERO + u(2, 3, i, eo, mu) = ZERO + u(3, 3, i, eo, mu) = ZERO + enddo + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine conf_cold(u) ! cold start + + use module_vol + implicit none + + GAUGE_FIELD :: u + integer :: i, eo, mu + + do mu = 1, DIM + do eo = EVEN, ODD + do i = 1, volh + u(1, 1, i, eo, mu) = ONE + u(2, 1, i, eo, mu) = ZERO + u(3, 1, i, eo, mu) = ZERO + u(1, 2, i, eo, mu) = ZERO + u(2, 2, i, eo, mu) = ONE + u(3, 2, i, eo, mu) = ZERO + u(1, 3, i, eo, mu) = ZERO + u(2, 3, i, eo, mu) = ZERO + u(3, 3, i, eo, mu) = ONE + enddo + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine conf_hot(u) ! hot start + + use module_vol + implicit none + + GAUGE_FIELD :: u + integer :: i, eo, mu + + do mu = 1, DIM + do eo = EVEN, ODD + call ran_gauss_volh(NCOL * NCOL, u(1, 1, 1, eo, mu), ONE, eo) + do i = 1, volh + call u_normalize(u(1, 1, i, eo, mu)) + enddo + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine conf_seq(action, u, u_io) ! arranges u-field for i/o + + use module_lattice_io + use module_decomp + use module_vol + implicit none + + character(*) :: action + GAUGE_FIELD :: u + GAUGE_FIELD_IO :: u_io + + integer, dimension(DIM) :: j + integer :: x, y, z, t, i, eo, mu, c1, c2 + integer, external :: std_xyzt2i, e_o + + do t = 0, NT - 1 + do z = 0, NZ - 1 + do y = 0, NY - 1 + do x = 0, NX - 1 + + j = (/x, y, z, t/) + + i = std_xyzt2i(j) + eo = e_o(j) + + do mu = 1, DIM + do c2 = 1, NCOL - 1 + do c1 = 1, NCOL + if (action == "read") then + u(c1, c2, i, eo, mu) = u_io(c1, c2, mu, x, y, z, t) + else + u_io(c1, c2, mu, x, y, z, t) = u(c1, c2, i, eo, mu) + endif + enddo + enddo + if (action == "read") call u_complete(u(1, 1, i, eo ,mu)) + enddo + enddo + enddo + enddo + enddo +end + +!------------------------------------------------------------------------------- +subroutine phi_seq(action, phi, phi_io) ! arranges phi-field for i/o + + use module_lattice_io + use module_decomp + use module_vol + implicit none + + character(*) :: action + SPINCOL_FIELD :: phi + SPINCOL_FIELD_IO :: phi_io + + integer, dimension(DIM) :: j + integer :: x, y, z, t, d, c, i + integer, external :: std_xyzt2i, e_o + + do t = 0, NT - 1 + do z = 0, NZ - 1 + do y = 0, NY - 1 + do x = 0, NX - 1 + + j = (/x, y, z, t/) + + if (e_o(j) == EVEN) then + i = std_xyzt2i(j) + do c = 1, NCOL + do d = 1, NDIRAC + if (action == "read") then + phi(d, c, i) = phi_io(d, c, x, y, z, t) + else + phi_io(d, c, x, y, z, t) = phi(d, c, i) + endif + enddo + enddo + endif + + enddo + enddo + enddo + enddo +end + +!------------------------------------------------------------------------------- +subroutine conf_read(restart, para, conf) + + use typedef_cksum + use typedef_para + use module_conf_info + use module_lattice_io + use module_p_interface + use module_switches + use module_vol + implicit none + + character(len = *), parameter :: READ = "read" + + logical :: restart + type(type_para) :: para + type(hmc_conf), dimension(MAX_TEMPER) :: conf + + type(type_conf_info) :: info + type(type_cksum), dimension(0:para%L(4)-1) :: cksum + P_GAUGE_FIELD_IO, save :: u_io + P_SPINCOL_FIELD_IO, save :: phi_io + FILENAME, external :: u_file, phi_file, info_file + FILENAME :: file + integer :: i, t + integer :: u_m, u_mx, phi_m, phi_mx + integer :: n_u_io, n_phi + + ALLOCATE_G_FIELD_IO(u_io) + ALLOCATE_SC_FIELD_IO(phi_io) + + u_m = NCOL * (NCOL - 1) * DIM + u_mx = NX + phi_m = NDIRAC * NCOL + phi_mx = NXH + n_u_io = u_m * vol * SIZE_COMPLEX + n_phi = size_sc_field + + do i = 1, para%n_temper + + if (restart) then + file = info_file(i) + else + file = para%info_file(i) + endif + + open(UINFO, file = file, action = READ, status = "old") + call read_conf_info_header(UINFO, info) + call check_conf_info_header(restart, info, para) + + if (restart) then + conf(i)%former = info%ensemble(2) + else + conf(i)%former = i + endif + + ! read U + + call read_cksum(restart, UINFO, cksum, para%L(4), i, u_file) + + TIMING_START(timing_bin_u_read) + call field_io(READ, u_m, u_mx, u_io, cksum) + TIMING_STOP(timing_bin_u_read) + + call conf_seq(READ, conf(i)%u, u_io) + call xbound_g_field(conf(i)%u) + + if (switches%tempering .and. switches%dynamical) then + ! read PHI + call read_cksum(restart, UINFO, cksum, para%L(4), i, phi_file) + call field_io(READ, phi_m, phi_mx, phi_io, cksum) + call phi_seq(READ, conf(i)%phi, phi_io) + call xbound_sc_field(conf(i)%phi) + endif + + close(UINFO) + enddo +end + +!------------------------------------------------------------------------------- +subroutine conf_write(restart, para, conf) + + use typedef_cksum + use typedef_para + use module_function_decl + use module_lattice_io + use module_p_interface + use module_switches + use module_vol + implicit none + + character(len = *), parameter :: WRITE = "write" + + logical :: restart + type(type_para) :: para + type(hmc_conf), dimension(MAX_TEMPER) :: conf + + type(type_cksum), dimension(0:para%L(4)-1) :: cksum + P_GAUGE_FIELD_IO, save :: u_io + P_SPINCOL_FIELD_IO, save :: phi_io + FILENAME, external :: u_file, phi_file, info_file + FILENAME, external :: conf_file, conf_info_file + FILENAME :: f_info + integer :: i, j, t + integer :: u_m, u_mx, phi_m, phi_mx + integer :: n_u_io, n_phi + REAL :: plaq + REAL, external :: sg + + + ALLOCATE_G_FIELD_IO(u_io) + ALLOCATE_SC_FIELD_IO(phi_io) + + u_m = NCOL * (NCOL - 1) * DIM + u_mx = NX + phi_m = NDIRAC * NCOL + phi_mx = NXH + n_u_io = u_m * vol * SIZE_COMPLEX + n_phi = size_sc_field + + call check_former(para%n_temper, conf) + + do i = 1, para%n_temper + + j = conf(i)%former + + if (restart) then + f_info = info_file(i) + else + f_info = conf_info_file(i, j) + endif + + if (my_pe() == 0) open(UINFO, file = f_info, action = WRITE) + plaq = sg(conf(i)%u) / (SIX * volume) + call write_conf_info_header(para, i, j, plaq) + + ! write U + + do t = 0, para%L(4) - 1 + if (restart) then + cksum(t)%file = u_file(i, t) + else + cksum(t)%file = conf_file(i, j, t) + endif + enddo + + call conf_seq(WRITE, conf(i)%u, u_io) + + TIMING_START(timing_bin_u_write) + call field_io(WRITE, u_m, u_mx, u_io, cksum) + TIMING_STOP(timing_bin_u_write) + + call write_cksum(UINFO, cksum, para%L(4)) + + if (switches%tempering .and. switches%dynamical .and. restart) then + ! write PHI + do t = 0, para%L(4) - 1 + cksum(t)%file = phi_file(i, t) + enddo + call phi_seq(WRITE, conf(i)%phi, phi_io) + call field_io(WRITE, phi_m, phi_mx, phi_io, cksum) + call write_cksum(UINFO, cksum, para%L(4)) + endif + + if (my_pe() == 0) close(UINFO) + enddo +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/conf_info.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/conf_info.F90 new file mode 100644 index 0000000000000000000000000000000000000000..a2b227a4b3b5b5753920c2ec33064f99a3640cb7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/conf_info.F90 @@ -0,0 +1,200 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! conf_info.F90 - read/write/check file containing configuration parameters +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine write_conf_info_header(para, i_ensemble1, i_ensemble2, plaq) + + use module_conf_info + use typedef_para + use module_bqcd + use module_counter + use module_decomp + use module_function_decl + implicit none + + type(type_para) :: para + REAL :: plaq + integer :: i_ensemble1, i_ensemble2 + integer :: i, e(2) + + e(1) = i_ensemble1 + e(2) = i_ensemble2 + + if (my_pe() == 0) then + call begin(UINFO, "ConfInfoHeader") + write(UINFO, *) k_format, conf_info_version + write(UINFO, 400) k_prog, prog_name, prog_version + write(UINFO, *) k_run, para%run + write(UINFO, *) k_traj, counter%traj + write(UINFO, 405) k_host, rechner() + write(UINFO, 400) k_date, datum(), uhrzeit() + write(UINFO, 410) k_L, decomp%std%L + write(UINFO, 410) k_bc, decomp%std%bc_fermions + write(UINFO, *) k_rkind, RKIND + write(UINFO, 420) k_plaq, plaq + + do i = 1, 2 + write(UINFO, *) trim(k_ensemble(i)), e(i) + write(UINFO, 405) trim(k_beta(i)), trim(para%c_hmc(e(i))%beta) + write(UINFO, 405) trim(k_kappa(i)), trim(para%c_hmc(e(i))%kappa) + write(UINFO, 405) trim(k_csw(i)), trim(para%c_hmc(e(i))%csw) + write(UINFO, 405) trim(k_csw_kappa(i)),trim(para%c_hmc(e(i))%csw_kappa) + write(UINFO, 405) trim(k_h(i)), trim(para%c_hmc(e(i))%h) + enddo + call end_A(UINFO, "ConfInfoHeader") + endif + +400 format (3(1x,a)) +405 format (2(1x,a)) +410 format (1x,a,4i3) +420 format (1x,a,1x,e25.14) +end + +!------------------------------------------------------------------------------- +subroutine read_conf_info_header(unit, info) + + use module_bqcd + use module_conf_info + implicit none + + type(type_conf_info) :: info + integer :: unit, v, i + integer, external :: pos_keyword + + call read_keyword_int(unit, k_format, v, 1) + + if (v /= conf_info_version) then + call die("read_conf_info_header(): wrong file format") + endif + + call read_keyword_int(unit, k_L, info%L, DIM) + call read_keyword_int(unit, k_bc, info%bc_fermions, DIM) + call read_keyword_int(unit, k_rkind, info%rkind, 1) + + do i = 1, 2 + call read_keyword_int (unit, k_ensemble(i), info%ensemble(i), 1) + call read_keyword_REAL(unit, k_beta(i), info%beta(i), 1) + call read_keyword_REAL(unit, k_kappa(i), info%kappa(i), 1) + call read_keyword_REAL(unit, k_csw(i), info%csw(i), 1) + call read_keyword_REAL(unit, k_csw_kappa(i),info%csw_kappa(i),1) + call read_keyword_REAL(unit, k_h(i), info%h(i), 1) + enddo + +end + +!------------------------------------------------------------------------------- +subroutine check_conf_info_header(restart, info, para) + + use module_conf_info + use module_decomp + use typedef_para + implicit none + + logical :: restart + type(type_conf_info) :: info + type(type_para) :: para + integer :: mu, i + + if (info%rkind /= RKIND) call die("check_conf_info_header(): RKIND wrong") + + do mu = 1, DIM + if (info%L(mu) /= decomp%std%L(mu)) then + call die("check_conf_info_header(): L inconsistent") + endif + enddo + + if (restart) then + do mu = 1, DIM + if (info%bc_fermions(mu) /= decomp%std%bc_fermions(mu)) then + call die("check_conf_info_header(): bc_fermions inconsistent") + endif + enddo + + do i = 1, 2 + if (info%ensemble(i) < 1 .or. info%ensemble(i) > para%n_temper) then + call die("check_conf_info_header(): i_ensemble out of range") + endif + + if (info%beta(i) /= para%hmc(info%ensemble(i))%beta) then + call die("check_conf_info_header(): beta inconsistent") + endif + + if (info%kappa(i) /= para%hmc(info%ensemble(i))%kappa) then + call die("check_conf_info_header(): kappa inconsistent") + endif + + if (info%csw(i) /= para%hmc(info%ensemble(i))%csw) then + call die("check_conf_info_header(): csw inconsistent") + endif + + if (abs(info%csw_kappa(i) - & + para%hmc(info%ensemble(i))%csw_kappa) > 1e-13 ) then + call die("check_conf_info_header(): csw_kappa inconsistent") + endif + + if (info%h(i) /= para%hmc(info%ensemble(i))%h) then + call die("check_conf_info_header(): h inconsistent") + endif + enddo + endif + +end + +!------------------------------------------------------------------------------- +subroutine read_cksum(restart, unit, cksum, LT, i_temper, file_name) + + use typedef_cksum + implicit none + logical :: restart + integer :: unit, LT, i_temper, t + FILENAME, external :: file_name + type(type_cksum), dimension(0:LT-1) :: cksum + + call pos_keyword(unit, ">BeginCheckSum") + read(unit,*) + + do t = 0, LT - 1 + read(unit,*) cksum(t)%file, cksum(t)%sum + + if (restart) then + if (cksum(t)%file /= file_name(i_temper, t)) then + call die("read_cksum(): file names inconsistent") + endif + endif + enddo + +end + +!------------------------------------------------------------------------------- +subroutine write_cksum(unit, cksum, LT) + + use typedef_cksum + use module_function_decl + implicit none + + integer :: unit, LT, i + type(type_cksum), dimension(LT) :: cksum + + if (my_pe() == 0) then + call begin(unit, "CheckSum") + do i = 1, LT + write(unit, *) trim(cksum(i)%file), cksum(i)%sum, cksum(i)%bytes + enddo + call end_A(unit, "CheckSum") + endif + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/cooling.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/cooling.F90 new file mode 100644 index 0000000000000000000000000000000000000000..fba73b1a8947cfb6f10f85cd709722a15f0e5a40 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/cooling.F90 @@ -0,0 +1,287 @@ +!------------------------------------------------------------------------------- +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! cooling.F90 - measurement of the topological charge using standard cooling +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_cooling + + integer, save :: n_cool + logical, dimension(:), pointer, save :: measure_q + +end + +!------------------------------------------------------------------------------- +subroutine init_cooling(list) + + use module_cooling + implicit none + character(*), intent(in) :: list + integer :: i, iostat + + if (list /= " ") then + open(ULIST, file = list, action = "read", status = "old") + + iostat = 0 + n_cool = 0 + do while (iostat == 0) + read(ULIST, *, iostat = iostat) i + if (i < 0) then + call die("init_cooling(): list has negative entries") + else + n_cool = max(n_cool, i) + endif + enddo + + allocate(measure_q(0:n_cool)) + + do i = 0, n_cool + measure_q(i) = .false. + enddo + + rewind(ULIST) + + iostat = 0 + do while (iostat == 0) + read(ULIST, *, iostat = iostat) i + measure_q(i) = .true. + enddo + + close(ULIST) + + else + n_cool = -1 + endif + +end + +!------------------------------------------------------------------------------- +subroutine cooling(u_in, traj, i_ensemble1, i_ensemble2) + + use module_cooling + use module_function_decl + use module_p_interface + use module_vol + implicit none + + integer, intent(in) :: traj, i_ensemble1, i_ensemble2 + GAUGE_FIELD, intent(in) :: u_in + P_GAUGE_FIELD, save :: u + integer :: i + character(len = *), parameter :: key = "%Qc" + REAL :: q, plaq + + if (n_cool < 0) return + + TIMING_START(timing_bin_cooling) + + ALLOCATE_G_FIELD(u) + + u = u_in + + call begin(UREC, "Cooling") + + if (my_pe() == 0) then + write(UREC, 400) "T", key, "traj", "e", "f", "i_cool", & + "Q_cool", "PlaqEnergy" + endif + + do i = 0, n_cool + if (measure_q(i)) then + call conf_check(u) + call top_charge(q, u, plaq) + if (my_pe() == 0) then + write(UREC, 410) key, traj, i_ensemble1, i_ensemble2, i, q, plaq + endif + endif + if (i < n_cool) call conf_relax(u) + enddo + + call end_A(UREC, "Cooling") + +400 format (1x, 2a, a6, 2a3, a8, a15, a15) +410 format (1x, a4, i6, 2i3, i8, f15.6, f15.10) + + TIMING_STOP(timing_bin_cooling) + +end + +!------------------------------------------------------------------------------- +subroutine conf_relax(u) + + use module_vol + implicit none + + GAUGE_FIELD, intent(inout) :: u + SU3 :: uuu, w, a + SU3, parameter :: su3_one = reshape( & + (/ ONE,ZERO,ZERO, & + ZERO,ONE,ZERO, & + ZERO,ZERO,ONE /), & + (/ NCOL, NCOL /)) + REAL :: p0, p1, p2, p3, fac + REAL :: a0, a1, a2, a3 + integer :: i, eo, mu, k, c1, c2 + + do mu = 1, DIM + do eo = EVEN, ODD + !$omp parallel do private(uuu, w, a, p0, p1, p2, p3, fac, & + !$omp a0, a1, a2, a3, k, c1, c2) + do i = 1, VOLH + call staple(uuu, u, i, eo, mu) + do k = 1, NCOL + if (k == 1) then + c1 = 1 + c2 = 2 + else if (k == 2) then + c1 = 1 + c2 = 3 + else if (k == 3) then + c1 = 2 + c2 = 3 + endif + + call uu(w, u(1, 1, i, eo, mu), uuu) ! w = u * uuu + + p0 = Re(w(c1, c1)) + Re(w(c2, c2)) + p1 = -(Im(w(c1, c2)) + Im(w(c2, c1))) + p2 = -(Re(w(c1, c2)) - Re(w(c2, c1))) + p3 = -(Im(w(c1, c1)) - Im(w(c2, c2))) + + fac = ONE / sqrt(p0**2 + p1**2 + p2**2 + p3**2) + + a0 = fac * p0 + a1 = fac * p1 + a2 = fac * p2 + a3 = fac * p3 + + a = su3_one + + a(c1, c1) = cmplx( a0, a3) + a(c1, c2) = cmplx( a2, a1) + a(c2, c1) = cmplx(-a2, a1) + a(c2, c2) = cmplx( a0, -a3) + + call u_update(u(1, 1, i, eo, mu), a) ! u = a * u + + enddo ! k + enddo ! i + call xbound_g(u, eo, mu) + enddo ! eo + enddo ! mu +end + +!------------------------------------------------------------------------------- +subroutine top_charge(qq, u, plaq_energy) + + use module_vol + use module_nn + implicit none + + REAL, intent(out) :: qq, plaq_energy + GAUGE_FIELD, intent(in) :: u + + integer :: e, o, mu, nu, i, j1, j2, j3, j4, j5, j6, j7, c1, c2 + SU3 :: uuu, left, right + COMPLEX, dimension(NCOL, NCOL, DIM - 1, DIM) :: ut ! U~(x,mu,nu) - h.c. + REAL :: q, plaq + REAL, external :: global_sum, Re_Tr_uu + + !---------------------------------------------------------------------- + ! + ! (j3, e) --->--- (j2, o) ---<--- x + ! | | | + ! | | | + ! ^ v ^ nu + ! | | | ^ + ! | | | | + ! (j4, o) ---<--- (i,e) --->--- (j1, o) x--> mu + ! | | | + ! | | | + ! v ^ v + ! | | | + ! | | | + ! (j5, e) --->--- (j6, o) ---<--- (j7, e) + ! + !---------------------------------------------------------------------- + + q = 0 + plaq = 0 + + do e = EVEN, ODD + o = EVEN + ODD - e + !$omp parallel do reduction(+: q, plaq) private(uuu, left, right, ut, & + !$omp mu, nu, i, j1, j2, j3, j4, j5, j6, j7, c1, c2) + do i = 1, VOLH + do mu = 1, DIM - 1 + do nu = mu + 1, DIM + + j1 = nn(i, e, mu, FWD) + j2 = nn(i, e, nu, FWD) + j3 = nn(j2, o, mu, BWD) + j4 = nn(j3, e, nu, BWD) + j5 = nn(j4, o, nu, BWD) + j6 = nn(j5, e, mu, FWD) + j7 = nn(j6, o, mu, FWD) + + uuu = 0 + + call uuu_fwd(uuu, u(1, 1, j1, o, nu), & + u(1, 1, j2, o, mu), & + u(1, 1, i, e, nu)) + + plaq = plaq + Re_Tr_uu(uuu, u(1, 1, i, e, mu)) + + call uuu_bwd_m(uuu, u(1, 1, j7, e, nu), & + u(1, 1, j6, o, mu), & + u(1, 1, j6, o, nu)) + + call uu(right, u(1, 1, i, e, mu), uuu) + + uuu = 0 + + call uuu_fwd(uuu, u(1, 1, i, e, nu), & + u(1, 1, j3, e, mu), & + u(1, 1, j4, o, nu)) + + call uuu_bwd_m(uuu, u(1, 1, j6, o, nu), & + u(1, 1, j5, e, mu), & + u(1, 1, j5, e, nu)) + + call uu(left, uuu, u(1, 1, j4, o, mu)) + + do c2 = 1, NCOL + do c1 = 1, NCOL + ut(c1, c2, mu, nu) = right(c1, c2) - conjg(right(c2, c1)) & + + left(c1, c2) - conjg(left(c2, c1)) + enddo + enddo + enddo ! nu + enddo ! mu + + q = q + Re_Tr_uu(ut(1, 1, 1, 2), ut(1, 1, 3, 4)) & + - Re_Tr_uu(ut(1, 1, 1, 3), ut(1, 1, 2, 4)) & + + Re_Tr_uu(ut(1, 1, 1, 4), ut(1, 1, 2, 3)) + + enddo ! i + enddo ! e/o + + q = global_sum(q) + plaq = global_sum(plaq) + + q = -q / (256 * PI**2) + qq = q + plaq_energy = ONE - plaq / (THREE * SIX * volume) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D.F90 new file mode 100644 index 0000000000000000000000000000000000000000..dc1887d20347cadf207611ee826a99426962eb8d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D.F90 @@ -0,0 +1,52 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! D.F90 - multiplication with the Wilson hopping matrix D (or D^\dagger) +! (optimization for Cray T3E) +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine NAME(e, o, out, in, u) + +! out := NAME in +! +! NAME = d or d_dag +! +! out is of type "e" = EVEN or ODD +! in is of type "o" = ODD or EVEN + + use module_nn + use module_vol + implicit none + + integer :: e, o + SPINCOL_FIELD :: out, in + GAUGE_FIELD :: u + + TIMING_START(STRCAT(timing_bin_, NAME)) + + call xbound_sc_field(in) + + call STRCAT(NAME, _t )(out, in, u(1, 1, 1, e, 4), u(1, 1, 1, o, 4), & + nn(1, e, 4, FWD), nn(1, e, 4, BWD), VOLH) + call STRCAT(NAME, _zb)(out, in, u(1, 1, 1, o, 3), nn(1, e, 3, BWD), VOLH) + call STRCAT(NAME, _zf)(out, in, u(1, 1, 1, e, 3), nn(1, e, 3, FWD), VOLH) + call STRCAT(NAME, _yb)(out, in, u(1, 1, 1, o, 2), nn(1, e, 2, BWD), VOLH) + call STRCAT(NAME, _yf)(out, in, u(1, 1, 1, e, 2), nn(1, e, 2, FWD), VOLH) + call STRCAT(NAME, _xb)(out, in, u(1, 1, 1, o, 1), nn(1, e, 1, BWD), VOLH) + call STRCAT(NAME, _xf)(out, in, u(1, 1, 1, e, 1), nn(1, e, 1, FWD), VOLH) + + TIMING_STOP(STRCAT(timing_bin_, NAME)) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D2.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D2.F90 new file mode 100644 index 0000000000000000000000000000000000000000..fe9f9fc5618e274f62ee710478b11d8a2145bb59 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D2.F90 @@ -0,0 +1,52 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! D2.F90 - multiplication with the Wilson hopping matrix D (or D^\dagger) +! (optimization for Hitachi SR8000) +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine NAME(e, o, out, in, u) + +! out := NAME in +! +! NAME = d or d_dag +! +! out is of type "e" = EVEN or ODD +! in is of type "o" = ODD or EVEN + + use module_nn + use module_vol + implicit none + + integer :: e, o + SPINCOL_FIELD :: out, in + GAUGE_FIELD :: u + + TIMING_START(STRCAT(timing_bin_, NAME)) + + call xbound_sc_field(in) + + call STRCAT(NAME, _t )(out, in, u(1, 1, 1, e, 4), u(1, 1, 1, o, 4), & + nn(1, e, 4, FWD), nn(1, e, 4, BWD), VOLH) + call STRCAT(NAME, _zf)(out, in, u(1, 1, 1, e, 3), u(1, 1, 1, o, 3), & + nn(1, e, 3, FWD), nn(1, e, 3, BWD), VOLH) + call STRCAT(NAME, _yf)(out, in, u(1, 1, 1, e, 2), u(1, 1, 1, o, 2), & + nn(1, e, 2, FWD), nn(1, e, 2, BWD), VOLH) + call STRCAT(NAME, _xf)(out, in, u(1, 1, 1, e, 1), u(1, 1, 1, o, 1), & + nn(1, e, 1, FWD), nn(1, e, 1, BWD), VOLH) + + TIMING_STOP(STRCAT(timing_bin_, NAME)) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D21.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D21.F90 new file mode 100644 index 0000000000000000000000000000000000000000..0ce1150ece84f5ace8a67a3871a369565b63d570 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D21.F90 @@ -0,0 +1,173 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! D21.F90 - multiplication with the Wilson hopping matrix D (or D^\dagger) +! projection onto 2 spincol components +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine NAME(e, o, out, in, u) + +! out := NAME in +! +! NAME = d or d_dag +! +! out is of type "e" = EVEN or ODD +! in is of type "o" = ODD or EVEN + + use module_d21 + use module_nn + use module_vol + use module_p_interface + implicit none + + integer :: e, o + SPINCOL_FIELD :: out, in + GAUGE_FIELD :: u + + + TIMING_START(STRCAT(timing_bin_, NAME)) + + ALLOCATE_SC2_FIELD(a) + + call STRCAT(NAME, _projection)(a, in) + +!!call xbound_sc2_field(a) + call xbound_sc2_field_i(a) + + call STRCAT(NAME, _t )(out, a(1, 1, 1, 4, FWD), a(1, 1, 1, 4, BWD), & + u(1, 1, 1, e, 4), u(1, 1, 1, o, 4), & + nn(1, e, 4, FWD), nn(1, e, 4, BWD), VOLH) + + call STRCAT(NAME, _zf)(out, a(1, 1, 1, 3, FWD), a(1, 1, 1, 3, BWD), & + u(1, 1, 1, e, 3), u(1, 1, 1, o, 3), & + nn(1, e, 3, FWD), nn(1, e, 3, BWD), VOLH) + + call STRCAT(NAME, _yf)(out, a(1, 1, 1, 2, FWD), a(1, 1, 1, 2, BWD), & + u(1, 1, 1, e, 2), u(1, 1, 1, o, 2), & + nn(1, e, 2, FWD), nn(1, e, 2, BWD), VOLH) + + call STRCAT(NAME, _xf)(out, a(1, 1, 1, 1, FWD), a(1, 1, 1, 1, BWD), & + u(1, 1, 1, e, 1), u(1, 1, 1, o, 1), & + nn(1, e, 1, FWD), nn(1, e, 1, BWD), VOLH) + + TIMING_STOP(STRCAT(timing_bin_, NAME)) + +end + +!------------------------------------------------------------------------------- +subroutine STRCAT(NAME, _projection)(out, in) + + use module_vol + implicit none + + SC2_FIELD, intent(out) :: out + SPINCOL_FIELD, intent(in) :: in + integer :: i + + ! statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + TIMING_START(timing_bin_sc2_projection) + +#ifdef DAGGER +# define PLUS - +# define MINUS + +# define D_T_ONE 3 +# define D_T_TWO 4 +# define D_T_THREE 1 +# define D_T_FOUR 2 +#else +# define PLUS + +# define MINUS - +# define D_T_ONE 1 +# define D_T_TWO 2 +# define D_T_THREE 3 +# define D_T_FOUR 4 +#endif + + + !$omp parallel do + do i = 1, volh + + out(1, 1, i, 1, FWD) = in(1, 1, i) MINUS i_times(in(4, 1, i)) + out(2, 1, i, 1, FWD) = in(2, 1, i) MINUS i_times(in(3, 1, i)) + out(1, 1, i, 1, BWD) = in(1, 1, i) PLUS i_times(in(4, 1, i)) + out(2, 1, i, 1, BWD) = in(2, 1, i) PLUS i_times(in(3, 1, i)) + + out(1, 2, i, 1, FWD) = in(1, 2, i) MINUS i_times(in(4, 2, i)) + out(2, 2, i, 1, FWD) = in(2, 2, i) MINUS i_times(in(3, 2, i)) + out(1, 2, i, 1, BWD) = in(1, 2, i) PLUS i_times(in(4, 2, i)) + out(2, 2, i, 1, BWD) = in(2, 2, i) PLUS i_times(in(3, 2, i)) + + out(1, 3, i, 1, FWD) = in(1, 3, i) MINUS i_times(in(4, 3, i)) + out(2, 3, i, 1, FWD) = in(2, 3, i) MINUS i_times(in(3, 3, i)) + out(1, 3, i, 1, BWD) = in(1, 3, i) PLUS i_times(in(4, 3, i)) + out(2, 3, i, 1, BWD) = in(2, 3, i) PLUS i_times(in(3, 3, i)) + + + out(1, 1, i, 2, FWD) = in(1, 1, i) MINUS in(4, 1, i) + out(2, 1, i, 2, FWD) = in(2, 1, i) PLUS in(3, 1, i) + out(1, 1, i, 2, BWD) = in(1, 1, i) PLUS in(4, 1, i) + out(2, 1, i, 2, BWD) = in(2, 1, i) MINUS in(3, 1, i) + + out(1, 2, i, 2, FWD) = in(1, 2, i) MINUS in(4, 2, i) + out(2, 2, i, 2, FWD) = in(2, 2, i) PLUS in(3, 2, i) + out(1, 2, i, 2, BWD) = in(1, 2, i) PLUS in(4, 2, i) + out(2, 2, i, 2, BWD) = in(2, 2, i) MINUS in(3, 2, i) + + out(1, 3, i, 2, FWD) = in(1, 3, i) MINUS in(4, 3, i) + out(2, 3, i, 2, FWD) = in(2, 3, i) PLUS in(3, 3, i) + out(1, 3, i, 2, BWD) = in(1, 3, i) PLUS in(4, 3, i) + out(2, 3, i, 2, BWD) = in(2, 3, i) MINUS in(3, 3, i) + + + out(1, 1, i, 3, FWD) = in(1, 1, i) MINUS i_times(in(3, 1, i)) + out(2, 1, i, 3, FWD) = in(2, 1, i) PLUS i_times(in(4, 1, i)) + out(1, 1, i, 3, BWD) = in(1, 1, i) PLUS i_times(in(3, 1, i)) + out(2, 1, i, 3, BWD) = in(2, 1, i) MINUS i_times(in(4, 1, i)) + + out(1, 2, i, 3, FWD) = in(1, 2, i) MINUS i_times(in(3, 2, i)) + out(2, 2, i, 3, FWD) = in(2, 2, i) PLUS i_times(in(4, 2, i)) + out(1, 2, i, 3, BWD) = in(1, 2, i) PLUS i_times(in(3, 2, i)) + out(2, 2, i, 3, BWD) = in(2, 2, i) MINUS i_times(in(4, 2, i)) + + out(1, 3, i, 3, FWD) = in(1, 3, i) MINUS i_times(in(3, 3, i)) + out(2, 3, i, 3, FWD) = in(2, 3, i) PLUS i_times(in(4, 3, i)) + out(1, 3, i, 3, BWD) = in(1, 3, i) PLUS i_times(in(3, 3, i)) + out(2, 3, i, 3, BWD) = in(2, 3, i) MINUS i_times(in(4, 3, i)) + + + out(1, 1, i, 4, FWD) = in(D_T_THREE, 1, i) + out(2, 1, i, 4, FWD) = in(D_T_FOUR, 1, i) + out(1, 1, i, 4, BWD) = in(D_T_ONE, 1, i) + out(2, 1, i, 4, BWD) = in(D_T_TWO, 1, i) + + out(1, 2, i, 4, FWD) = in(D_T_THREE, 2, i) + out(2, 2, i, 4, FWD) = in(D_T_FOUR, 2, i) + out(1, 2, i, 4, BWD) = in(D_T_ONE, 2, i) + out(2, 2, i, 4, BWD) = in(D_T_TWO, 2, i) + + out(1, 3, i, 4, FWD) = in(D_T_THREE, 3, i) + out(2, 3, i, 4, FWD) = in(D_T_FOUR, 3, i) + out(1, 3, i, 4, BWD) = in(D_T_ONE, 3, i) + out(2, 3, i, 4, BWD) = in(D_T_TWO, 3, i) + + enddo + + TIMING_STOP(timing_bin_sc2_projection) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D21xyzt.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D21xyzt.F90 new file mode 100644 index 0000000000000000000000000000000000000000..db8ef9618b9be3f0f02b2d7ba2385734a231d43e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D21xyzt.F90 @@ -0,0 +1,188 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! D21xyzt.F90 - routines needed in D21.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +# define GAMMA_AB1(C) a_bwd(1, C, jb) +# define GAMMA_AB2(C) a_bwd(2, C, jb) +# define GAMMA_AF1(C) a_fwd(1, C, jf) +# define GAMMA_AF2(C) a_fwd(2, C, jf) + +#ifdef DIR_T + +#ifdef DAGGER +# define GAMMA_B1(C) bf1_ ## C +# define GAMMA_B2(C) bf2_ ## C +# define GAMMA_B3(C) bb1_ ## C +# define GAMMA_B4(C) bb2_ ## C +#else +# define GAMMA_B1(C) bb1_ ## C +# define GAMMA_B2(C) bb2_ ## C +# define GAMMA_B3(C) bf1_ ## C +# define GAMMA_B4(C) bf2_ ## C +#endif + +# define UPDATE_B(S, C) b(S, C, i) = TWO * GAMMA_B ## S ## (C) + +#else + +#ifdef DAGGER +# define PLUS - +# define MINUS + +#else +# define PLUS + +# define MINUS - +#endif + +#ifdef DIR_X +# define GAMMA_B3(C) MINUS i_times(bb2_ ## C) PLUS i_times(bf2_ ## C) +# define GAMMA_B4(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C) +#endif + +#ifdef DIR_Y +# define GAMMA_B3(C) MINUS bb2_ ## C PLUS bf2_ ## C +# define GAMMA_B4(C) PLUS bb1_ ## C MINUS bf1_ ## C +#endif + +#ifdef DIR_Z +# define GAMMA_B3(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C) +# define GAMMA_B4(C) PLUS i_times(bb2_ ## C) MINUS i_times(bf2_ ## C) +#endif + +# define GAMMA_B1(C) + bb1_ ## C + bf1_ ## C +# define GAMMA_B2(C) + bb2_ ## C + bf2_ ## C + +# define UPDATE_B(S, C) b(S, C, i) = b(S, C, i) GAMMA_B ## S ## (C) + +#endif + +!------------------------------------------------------------------------------- +subroutine NAME(b, a_fwd, a_bwd, u_e, u_o, nn_fwd, nn_bwd, volh) + + implicit none + + COMPLEX, dimension (NDIRAC, NCOL, *), intent(inout) :: b + COMPLEX, dimension (2, NCOL, *), intent(in) :: a_fwd, a_bwd + COMPLEX, dimension (NCOL, NCOL, *), intent(in) :: u_e, u_o + INTEGER, dimension (*), intent(in) :: nn_fwd, nn_bwd + integer :: volh + + integer :: i, jf, jb + + COMPLEX :: ab1, ab2, af1, af2 + COMPLEX :: bf1_1, bf2_1 + COMPLEX :: bf1_2, bf2_2 + COMPLEX :: bf1_3, bf2_3 + COMPLEX :: bb1_1, bb2_1 + COMPLEX :: bb1_2, bb2_2 + COMPLEX :: bb1_3, bb2_3 + + ! statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + + TIMING_START(STRCAT(timing_bin_, NAME)) + + !$omp parallel do private(jf, jb, ab1, ab2, af1, af2, & + !$omp bf1_1, bf2_1, bf1_2, bf2_2, bf1_3, bf2_3, & + !$omp bb1_1, bb2_1, bb1_2, bb2_2, bb1_3, bb2_3) + do i = 1, volh + jb = nn_bwd(i) + + ab1 = GAMMA_AB1(1) + ab2 = GAMMA_AB2(1) + + bb1_1 = ab1 * conjg(u_o(1, 1, jb)) + bb2_1 = ab2 * conjg(u_o(1, 1, jb)) + bb1_2 = ab1 * conjg(u_o(1, 2, jb)) + bb2_2 = ab2 * conjg(u_o(1, 2, jb)) + bb1_3 = ab1 * conjg(u_o(1, 3, jb)) + bb2_3 = ab2 * conjg(u_o(1, 3, jb)) + + jf = nn_fwd(i) + + af1 = GAMMA_AF1(1) + af2 = GAMMA_AF2(1) + + bf1_1 = af1 * u_e(1, 1, i) + bf2_1 = af2 * u_e(1, 1, i) + bf1_2 = af1 * u_e(2, 1, i) + bf2_2 = af2 * u_e(2, 1, i) + bf1_3 = af1 * u_e(3, 1, i) + bf2_3 = af2 * u_e(3, 1, i) + + ab1 = GAMMA_AB1(2) + ab2 = GAMMA_AB2(2) + + bb1_1 = bb1_1 + ab1 * conjg(u_o(2, 1, jb)) + bb2_1 = bb2_1 + ab2 * conjg(u_o(2, 1, jb)) + bb1_2 = bb1_2 + ab1 * conjg(u_o(2, 2, jb)) + bb2_2 = bb2_2 + ab2 * conjg(u_o(2, 2, jb)) + bb1_3 = bb1_3 + ab1 * conjg(u_o(2, 3, jb)) + bb2_3 = bb2_3 + ab2 * conjg(u_o(2, 3, jb)) + + af1 = GAMMA_AF1(2) + af2 = GAMMA_AF2(2) + + bf1_1 = bf1_1 + af1 * u_e(1, 2, i) + bf2_1 = bf2_1 + af2 * u_e(1, 2, i) + bf1_2 = bf1_2 + af1 * u_e(2, 2, i) + bf2_2 = bf2_2 + af2 * u_e(2, 2, i) + bf1_3 = bf1_3 + af1 * u_e(3, 2, i) + bf2_3 = bf2_3 + af2 * u_e(3, 2, i) + + ab1 = GAMMA_AB1(3) + ab2 = GAMMA_AB2(3) + + bb1_1 = bb1_1 + ab1 * conjg(u_o(3, 1, jb)) + bb2_1 = bb2_1 + ab2 * conjg(u_o(3, 1, jb)) + bb1_2 = bb1_2 + ab1 * conjg(u_o(3, 2, jb)) + bb2_2 = bb2_2 + ab2 * conjg(u_o(3, 2, jb)) + bb1_3 = bb1_3 + ab1 * conjg(u_o(3, 3, jb)) + bb2_3 = bb2_3 + ab2 * conjg(u_o(3, 3, jb)) + + af1 = GAMMA_AF1(3) + af2 = GAMMA_AF2(3) + + bf1_1 = bf1_1 + af1 * u_e(1, 3, i) + bf2_1 = bf2_1 + af2 * u_e(1, 3, i) + bf1_2 = bf1_2 + af1 * u_e(2, 3, i) + bf2_2 = bf2_2 + af2 * u_e(2, 3, i) + bf1_3 = bf1_3 + af1 * u_e(3, 3, i) + bf2_3 = bf2_3 + af2 * u_e(3, 3, i) + + + UPDATE_B(1, 1) + UPDATE_B(2, 1) + UPDATE_B(3, 1) + UPDATE_B(4, 1) + + UPDATE_B(1, 2) + UPDATE_B(2, 2) + UPDATE_B(3, 2) + UPDATE_B(4, 2) + + UPDATE_B(1, 3) + UPDATE_B(2, 3) + UPDATE_B(3, 3) + UPDATE_B(4, 3) + + enddo + + TIMING_STOP(STRCAT(timing_bin_, NAME)) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D2xyzt.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D2xyzt.F90 new file mode 100644 index 0000000000000000000000000000000000000000..01e0aa6e50b47ce1419a946d11e0ceb11fe02867 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D2xyzt.F90 @@ -0,0 +1,202 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! D2xyzt.F90 - routines needed in D2.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +#ifdef DIR_T + +#ifdef DAGGER +# define GAMMA_AB1(C) a(3, C, jb) +# define GAMMA_AB2(C) a(4, C, jb) +# define GAMMA_AF1(C) a(1, C, jf) +# define GAMMA_AF2(C) a(2, C, jf) +# define GAMMA_B1(C) bf1_ ## C +# define GAMMA_B2(C) bf2_ ## C +# define GAMMA_B3(C) bb1_ ## C +# define GAMMA_B4(C) bb2_ ## C +#else +# define GAMMA_AB1(C) a(1, C, jb) +# define GAMMA_AB2(C) a(2, C, jb) +# define GAMMA_AF1(C) a(3, C, jf) +# define GAMMA_AF2(C) a(4, C, jf) +# define GAMMA_B1(C) bb1_ ## C +# define GAMMA_B2(C) bb2_ ## C +# define GAMMA_B3(C) bf1_ ## C +# define GAMMA_B4(C) bf2_ ## C +#endif + +# define UPDATE_B(S, C) b(S, C, i) = TWO * GAMMA_B ## S ## (C) + +#else + +#ifdef DAGGER +# define PLUS - +# define MINUS + +#else +# define PLUS + +# define MINUS - +#endif + +#ifdef DIR_X +# define GAMMA_AB1(C) a(1, C, jb) PLUS i_times(a(4, C, jb)) +# define GAMMA_AB2(C) a(2, C, jb) PLUS i_times(a(3, C, jb)) +# define GAMMA_AF1(C) a(1, C, jf) MINUS i_times(a(4, C, jf)) +# define GAMMA_AF2(C) a(2, C, jf) MINUS i_times(a(3, C, jf)) +# define GAMMA_B3(C) MINUS i_times(bb2_ ## C) PLUS i_times(bf2_ ## C) +# define GAMMA_B4(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C) +#endif + +#ifdef DIR_Y +# define GAMMA_AB1(C) a(1, C, jb) PLUS a(4, C, jb) +# define GAMMA_AB2(C) a(2, C, jb) MINUS a(3, C, jb) +# define GAMMA_AF1(C) a(1, C, jf) MINUS a(4, C, jf) +# define GAMMA_AF2(C) a(2, C, jf) PLUS a(3, C, jf) +# define GAMMA_B3(C) MINUS bb2_ ## C PLUS bf2_ ## C +# define GAMMA_B4(C) PLUS bb1_ ## C MINUS bf1_ ## C +#endif + +#ifdef DIR_Z +# define GAMMA_AB1(C) a(1, C, jb) PLUS i_times(a(3, C, jb)) +# define GAMMA_AB2(C) a(2, C, jb) MINUS i_times(a(4, C, jb)) +# define GAMMA_AF1(C) a(1, C, jf) MINUS i_times(a(3, C, jf)) +# define GAMMA_AF2(C) a(2, C, jf) PLUS i_times(a(4, C, jf)) +# define GAMMA_B3(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C) +# define GAMMA_B4(C) PLUS i_times(bb2_ ## C) MINUS i_times(bf2_ ## C) +#endif + +# define GAMMA_B1(C) + bb1_ ## C + bf1_ ## C +# define GAMMA_B2(C) + bb2_ ## C + bf2_ ## C + +# define UPDATE_B(S, C) b(S, C, i) = b(S, C, i) GAMMA_B ## S ## (C) + +#endif + +!------------------------------------------------------------------------------- +subroutine NAME(b, a, u_e, u_o, nn_fwd, nn_bwd, volh) + + implicit none + + COMPLEX, dimension (NDIRAC, NCOL, *), intent(inout) :: b + COMPLEX, dimension (NDIRAC, NCOL, *), intent(in) :: a + COMPLEX, dimension (NCOL, NCOL, *), intent(in) :: u_e, u_o + INTEGER, dimension (*), intent(in) :: nn_fwd, nn_bwd + integer :: volh + + integer :: i, jf, jb + + COMPLEX :: ab1, ab2, af1, af2 + COMPLEX :: bf1_1, bf2_1 + COMPLEX :: bf1_2, bf2_2 + COMPLEX :: bf1_3, bf2_3 + COMPLEX :: bb1_1, bb2_1 + COMPLEX :: bb1_2, bb2_2 + COMPLEX :: bb1_3, bb2_3 + + ! statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + TIMING_START(STRCAT(timing_bin_, NAME)) + + !$omp parallel do private(jf, jb, ab1, ab2, af1, af2, & + !$omp bf1_1, bf2_1, bf1_2, bf2_2, bf1_3, bf2_3, & + !$omp bb1_1, bb2_1, bb1_2, bb2_2, bb1_3, bb2_3) + do i = 1, volh + jb = nn_bwd(i) + + ab1 = GAMMA_AB1(1) + ab2 = GAMMA_AB2(1) + + bb1_1 = ab1 * conjg(u_o(1, 1, jb)) + bb2_1 = ab2 * conjg(u_o(1, 1, jb)) + bb1_2 = ab1 * conjg(u_o(1, 2, jb)) + bb2_2 = ab2 * conjg(u_o(1, 2, jb)) + bb1_3 = ab1 * conjg(u_o(1, 3, jb)) + bb2_3 = ab2 * conjg(u_o(1, 3, jb)) + + jf = nn_fwd(i) + + af1 = GAMMA_AF1(1) + af2 = GAMMA_AF2(1) + + bf1_1 = af1 * u_e(1, 1, i) + bf2_1 = af2 * u_e(1, 1, i) + bf1_2 = af1 * u_e(2, 1, i) + bf2_2 = af2 * u_e(2, 1, i) + bf1_3 = af1 * u_e(3, 1, i) + bf2_3 = af2 * u_e(3, 1, i) + + ab1 = GAMMA_AB1(2) + ab2 = GAMMA_AB2(2) + + bb1_1 = bb1_1 + ab1 * conjg(u_o(2, 1, jb)) + bb2_1 = bb2_1 + ab2 * conjg(u_o(2, 1, jb)) + bb1_2 = bb1_2 + ab1 * conjg(u_o(2, 2, jb)) + bb2_2 = bb2_2 + ab2 * conjg(u_o(2, 2, jb)) + bb1_3 = bb1_3 + ab1 * conjg(u_o(2, 3, jb)) + bb2_3 = bb2_3 + ab2 * conjg(u_o(2, 3, jb)) + + af1 = GAMMA_AF1(2) + af2 = GAMMA_AF2(2) + + bf1_1 = bf1_1 + af1 * u_e(1, 2, i) + bf2_1 = bf2_1 + af2 * u_e(1, 2, i) + bf1_2 = bf1_2 + af1 * u_e(2, 2, i) + bf2_2 = bf2_2 + af2 * u_e(2, 2, i) + bf1_3 = bf1_3 + af1 * u_e(3, 2, i) + bf2_3 = bf2_3 + af2 * u_e(3, 2, i) + + ab1 = GAMMA_AB1(3) + ab2 = GAMMA_AB2(3) + + bb1_1 = bb1_1 + ab1 * conjg(u_o(3, 1, jb)) + bb2_1 = bb2_1 + ab2 * conjg(u_o(3, 1, jb)) + bb1_2 = bb1_2 + ab1 * conjg(u_o(3, 2, jb)) + bb2_2 = bb2_2 + ab2 * conjg(u_o(3, 2, jb)) + bb1_3 = bb1_3 + ab1 * conjg(u_o(3, 3, jb)) + bb2_3 = bb2_3 + ab2 * conjg(u_o(3, 3, jb)) + + af1 = GAMMA_AF1(3) + af2 = GAMMA_AF2(3) + + bf1_1 = bf1_1 + af1 * u_e(1, 3, i) + bf2_1 = bf2_1 + af2 * u_e(1, 3, i) + bf1_2 = bf1_2 + af1 * u_e(2, 3, i) + bf2_2 = bf2_2 + af2 * u_e(2, 3, i) + bf1_3 = bf1_3 + af1 * u_e(3, 3, i) + bf2_3 = bf2_3 + af2 * u_e(3, 3, i) + + + UPDATE_B(1, 1) + UPDATE_B(2, 1) + UPDATE_B(3, 1) + UPDATE_B(4, 1) + + UPDATE_B(1, 2) + UPDATE_B(2, 2) + UPDATE_B(3, 2) + UPDATE_B(4, 2) + + UPDATE_B(1, 3) + UPDATE_B(2, 3) + UPDATE_B(3, 3) + UPDATE_B(4, 3) + + enddo + + TIMING_STOP(STRCAT(timing_bin_, NAME)) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D3.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D3.F90 new file mode 100644 index 0000000000000000000000000000000000000000..f2916b5d3d041ca90c915f51913ecbfb90c60b1f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D3.F90 @@ -0,0 +1,116 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! D3.F90 - multiplication with the Wilson hopping matrix D (or D^\dagger) +! (optimization for Hitachi SR8000: hybrid programming model, +! MPI + OpenMP + overlapping communication and computation) +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine NAME(ee, oo, out, in, u) + +! out := NAME in +! +! NAME = d or d_dag +! +! out is of type "e" = EVEN or ODD +! in is of type "o" = ODD or EVEN + + use module_nn + use module_vol + use module_thread + implicit none + + integer :: ee, oo + SPINCOL_FIELD :: out, in + GAUGE_FIELD :: u + + integer :: thread, i1, i2, omp_get_thread_num, e, o + + TIMING_START(STRCAT(timing_bin_, NAME)) + + call xbound_fill_buffer_y(in) + call xbound_fill_buffer_z(in) + + !$omp parallel private(thread, i1, i2, e, o) + + thread = omp_get_thread_num() + e = ee + o = oo + + !$omp barrier + + i1 = xyz_start(thread) + i2 = xyz_end(thread) + + if (thread == 0) then + TIMING_START(timing_bin_d_xf) + call xbound_copy_buffer_y(in) + else + call STRCAT(NAME, _xf)(out, in, u(1, 1, 1, e, 1), & + u(1, 1, 1, o, 1), & + nn(1, e, 1, FWD), & + nn(1, e, 1, BWD), i1, i2) + endif + + !$omp barrier + + if (thread == 0) then + TIMING_STOP(timing_bin_d_xf) + TIMING_START(timing_bin_d_yf) + call xbound_copy_buffer_z(in) + !!call xbound_d3(in, 3) + else + call STRCAT(NAME, _yf)(out, in, u(1, 1, 1, e, 2), & + u(1, 1, 1, o, 2), & + nn(1, e, 2, FWD), & + nn(1, e, 2, BWD), i1, i2) + endif + + !$omp barrier + + if (thread == 0) then + TIMING_STOP(timing_bin_d_yf) + TIMING_START(timing_bin_d_zf) + call xbound_d3(in, 4) + else + call STRCAT(NAME, _zf)(out, in, u(1, 1, 1, e, 3), & + u(1, 1, 1, o, 3), & + nn(1, e, 3, FWD), & + nn(1, e, 3, BWD), i1, i2) + endif + + !$omp barrier + +#ifdef TIMING + if (thread == 0) then + TIMING_STOP(timing_bin_d_zf) + TIMING_START(timing_bin_d_t) + endif +#endif + + i1 = t_start(thread) + i2 = t_end(thread) + + call STRCAT(NAME, _t )(out, in, u(1, 1, 1, e, 4), & + u(1, 1, 1, o, 4), & + nn(1, e, 4, FWD), & + nn(1, e, 4, BWD), i1, i2) + + !$omp end parallel + + TIMING_STOP(timing_bin_d_t) + TIMING_STOP(STRCAT(timing_bin_, NAME)) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31.F90 new file mode 100644 index 0000000000000000000000000000000000000000..81da092c7102ce3c7954fe6c420110c38cdc368a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31.F90 @@ -0,0 +1,104 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! D3.F90 - multiplication with the Wilson hopping matrix D (or D^\dagger) +! (optimization for Hitachi SR8000: hybrid programming model, +! MPI + OpenMP + overlapping communication and computation) +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine NAME(ee, oo, out, in, u) + +! out := NAME in +! +! NAME = d or d_dag +! +! out is of type "e" = EVEN or ODD +! in is of type "o" = ODD or EVEN + + use module_nn + use module_vol + use module_thread + implicit none + + integer :: ee, oo + SPINCOL_FIELD :: out, in + GAUGE_FIELD :: u + + integer :: thread, i1, i2, omp_get_thread_num, e, o + + TIMING_START(STRCAT(timing_bin_, NAME)) + + call xbound_fill_buffer_y(in) + call xbound_fill_buffer_z(in) + + !$omp parallel private(thread, i1, i2, e, o) + + thread = omp_get_thread_num() + e = ee + o = oo + + !$omp barrier + + i1 = xyz_start(thread) + i2 = xyz_end(thread) + + if (thread == 0) then + TIMING_START(timing_bin_d_xf) + call xbound_copy_buffer_y(in) + else + call STRCAT(NAME, _switch_0)(e, o, out, in, u, i1, i2, 1) + endif + + !$omp barrier + + if (thread == 0) then + TIMING_STOP(timing_bin_d_xf) + TIMING_START(timing_bin_d_yf) + call xbound_copy_buffer_z(in) + !!call xbound_d3(in, 3) + else + call STRCAT(NAME, _switch)(e, o, out, in, u, i1, i2, 2) + endif + + !$omp barrier + + if (thread == 0) then + TIMING_STOP(timing_bin_d_yf) + TIMING_START(timing_bin_d_zf) + call xbound_d3(in, 4) + else + call STRCAT(NAME, _switch)(e, o, out, in, u, i1, i2, 3) + endif + + !$omp barrier + +#ifdef TIMING + if (thread == 0) then + TIMING_STOP(timing_bin_d_zf) + TIMING_START(timing_bin_d_t) + endif +#endif + + i1 = t_start(thread) + i2 = t_end(thread) + + call STRCAT(NAME, _switch)(e, o, out, in, u, i1, i2, 4) + + !$omp end parallel + + TIMING_STOP(timing_bin_d_t) + TIMING_STOP(STRCAT(timing_bin_, NAME)) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31_switch.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31_switch.F90 new file mode 100644 index 0000000000000000000000000000000000000000..bf61467401f0815012d1aac01c2ff3ee15e2fc79 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31_switch.F90 @@ -0,0 +1,59 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! D3_switch.F90 - switch layer for arbitrary choice of "gamma_index" +! +!------------------------------------------------------------------------------- +# include "defs.h" + +#ifdef INIT +# define THE_NAME(a) STRCAT3(NAME, a, _0) +#else +# define THE_NAME(a) STRCAT(NAME, a) +#endif + +! NAME = d or d_dag +!------------------------------------------------------------------------------- +subroutine THE_NAME(_switch)(e, o, out, in, u, i1, i2, mu) + + use module_lattice + use module_nn + use module_vol + implicit none + + integer :: e, o + SPINCOL_FIELD :: out, in + GAUGE_FIELD :: u + integer :: i1, i2, mu + + select case (gamma_index(mu)) + case (1) + call THE_NAME(_xf)(out, in, u(1, 1, 1, e, 1), & + u(1, 1, 1, o, 1), & + nn(1, e, 1, FWD), & + nn(1, e, 1, BWD), i1, i2) + case (2) + call THE_NAME(_yf)(out, in, u(1, 1, 1, e, 2), & + u(1, 1, 1, o, 2), & + nn(1, e, 2, FWD), & + nn(1, e, 2, BWD), i1, i2) + case (3) + call THE_NAME(_zf)(out, in, u(1, 1, 1, e, 3), & + u(1, 1, 1, o, 3), & + nn(1, e, 3, FWD), & + nn(1, e, 3, BWD), i1, i2) + case (4) + call THE_NAME(_t )(out, in, u(1, 1, 1, e, 4), & + u(1, 1, 1, o, 4), & + nn(1, e, 4, FWD), & + nn(1, e, 4, BWD), i1, i2) + end select +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31xyzt.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31xyzt.F90 new file mode 100644 index 0000000000000000000000000000000000000000..534277d9eb8aba4c5de9085dea78aafc7f572640 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31xyzt.F90 @@ -0,0 +1,213 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! D3xyzt.F90 - routines needed in D3.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +#ifdef DIR_T + +#ifdef DAGGER +# define GAMMA_AB1(C) a(3, C, jb) +# define GAMMA_AB2(C) a(4, C, jb) +# define GAMMA_AF1(C) a(1, C, jf) +# define GAMMA_AF2(C) a(2, C, jf) +# define GAMMA_B1(C) bf1_ ## C +# define GAMMA_B2(C) bf2_ ## C +# define GAMMA_B3(C) bb1_ ## C +# define GAMMA_B4(C) bb2_ ## C +#else +# define GAMMA_AB1(C) a(1, C, jb) +# define GAMMA_AB2(C) a(2, C, jb) +# define GAMMA_AF1(C) a(3, C, jf) +# define GAMMA_AF2(C) a(4, C, jf) +# define GAMMA_B1(C) bb1_ ## C +# define GAMMA_B2(C) bb2_ ## C +# define GAMMA_B3(C) bf1_ ## C +# define GAMMA_B4(C) bf2_ ## C +#endif + +#ifdef INIT +# define UPDATE_B(S, C) b(S, C, i) = TWO * GAMMA_B ## S ## (C) +#else +# define UPDATE_B(S, C) b(S, C, i) = b(S, C, i) + TWO * GAMMA_B ## S ## (C) +#endif + +#else + +#ifdef DAGGER +# define PLUS - +# define MINUS + +#else +# define PLUS + +# define MINUS - +#endif + +#ifdef DIR_X +# define GAMMA_AB1(C) a(1, C, jb) PLUS i_times(a(4, C, jb)) +# define GAMMA_AB2(C) a(2, C, jb) PLUS i_times(a(3, C, jb)) +# define GAMMA_AF1(C) a(1, C, jf) MINUS i_times(a(4, C, jf)) +# define GAMMA_AF2(C) a(2, C, jf) MINUS i_times(a(3, C, jf)) +# define GAMMA_B3(C) MINUS i_times(bb2_ ## C) PLUS i_times(bf2_ ## C) +# define GAMMA_B4(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C) +#endif + +#ifdef DIR_Y +# define GAMMA_AB1(C) a(1, C, jb) PLUS a(4, C, jb) +# define GAMMA_AB2(C) a(2, C, jb) MINUS a(3, C, jb) +# define GAMMA_AF1(C) a(1, C, jf) MINUS a(4, C, jf) +# define GAMMA_AF2(C) a(2, C, jf) PLUS a(3, C, jf) +# define GAMMA_B3(C) MINUS bb2_ ## C PLUS bf2_ ## C +# define GAMMA_B4(C) PLUS bb1_ ## C MINUS bf1_ ## C +#endif + +#ifdef DIR_Z +# define GAMMA_AB1(C) a(1, C, jb) PLUS i_times(a(3, C, jb)) +# define GAMMA_AB2(C) a(2, C, jb) MINUS i_times(a(4, C, jb)) +# define GAMMA_AF1(C) a(1, C, jf) MINUS i_times(a(3, C, jf)) +# define GAMMA_AF2(C) a(2, C, jf) PLUS i_times(a(4, C, jf)) +# define GAMMA_B3(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C) +# define GAMMA_B4(C) PLUS i_times(bb2_ ## C) MINUS i_times(bf2_ ## C) +#endif + +# define GAMMA_B1(C) + bb1_ ## C + bf1_ ## C +# define GAMMA_B2(C) + bb2_ ## C + bf2_ ## C + +#ifdef INIT +# define UPDATE_B(S, C) b(S, C, i) = GAMMA_B ## S ## (C) +#else +# define UPDATE_B(S, C) b(S, C, i) = b(S, C, i) GAMMA_B ## S ## (C) +#endif + +#endif + +#ifdef INIT +# define THE_NAME STRCAT(NAME, _0) +#else +# define THE_NAME NAME +#endif + +!------------------------------------------------------------------------------- +subroutine THE_NAME(b, a, u_e, u_o, nn_fwd, nn_bwd, i1, i2) + + implicit none + + COMPLEX, dimension (NDIRAC, NCOL, *), intent(inout) :: b + COMPLEX, dimension (NDIRAC, NCOL, *), intent(in) :: a + COMPLEX, dimension (NCOL, NCOL, *), intent(in) :: u_e, u_o + INTEGER, dimension (*), intent(in) :: nn_fwd, nn_bwd + integer :: i1, i2 + + integer :: i, jf, jb + + COMPLEX :: ab1, ab2, af1, af2 + COMPLEX :: bf1_1, bf2_1 + COMPLEX :: bf1_2, bf2_2 + COMPLEX :: bf1_3, bf2_3 + COMPLEX :: bb1_1, bb2_1 + COMPLEX :: bb1_2, bb2_2 + COMPLEX :: bb1_3, bb2_3 + + ! statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + !!TIMING_START(STRCAT(timing_bin_, NAME)) + + do i = i1, i2 + jb = nn_bwd(i) + + ab1 = GAMMA_AB1(1) + ab2 = GAMMA_AB2(1) + + bb1_1 = ab1 * conjg(u_o(1, 1, jb)) + bb2_1 = ab2 * conjg(u_o(1, 1, jb)) + bb1_2 = ab1 * conjg(u_o(1, 2, jb)) + bb2_2 = ab2 * conjg(u_o(1, 2, jb)) + bb1_3 = ab1 * conjg(u_o(1, 3, jb)) + bb2_3 = ab2 * conjg(u_o(1, 3, jb)) + + jf = nn_fwd(i) + + af1 = GAMMA_AF1(1) + af2 = GAMMA_AF2(1) + + bf1_1 = af1 * u_e(1, 1, i) + bf2_1 = af2 * u_e(1, 1, i) + bf1_2 = af1 * u_e(2, 1, i) + bf2_2 = af2 * u_e(2, 1, i) + bf1_3 = af1 * u_e(3, 1, i) + bf2_3 = af2 * u_e(3, 1, i) + + ab1 = GAMMA_AB1(2) + ab2 = GAMMA_AB2(2) + + bb1_1 = bb1_1 + ab1 * conjg(u_o(2, 1, jb)) + bb2_1 = bb2_1 + ab2 * conjg(u_o(2, 1, jb)) + bb1_2 = bb1_2 + ab1 * conjg(u_o(2, 2, jb)) + bb2_2 = bb2_2 + ab2 * conjg(u_o(2, 2, jb)) + bb1_3 = bb1_3 + ab1 * conjg(u_o(2, 3, jb)) + bb2_3 = bb2_3 + ab2 * conjg(u_o(2, 3, jb)) + + af1 = GAMMA_AF1(2) + af2 = GAMMA_AF2(2) + + bf1_1 = bf1_1 + af1 * u_e(1, 2, i) + bf2_1 = bf2_1 + af2 * u_e(1, 2, i) + bf1_2 = bf1_2 + af1 * u_e(2, 2, i) + bf2_2 = bf2_2 + af2 * u_e(2, 2, i) + bf1_3 = bf1_3 + af1 * u_e(3, 2, i) + bf2_3 = bf2_3 + af2 * u_e(3, 2, i) + + ab1 = GAMMA_AB1(3) + ab2 = GAMMA_AB2(3) + + bb1_1 = bb1_1 + ab1 * conjg(u_o(3, 1, jb)) + bb2_1 = bb2_1 + ab2 * conjg(u_o(3, 1, jb)) + bb1_2 = bb1_2 + ab1 * conjg(u_o(3, 2, jb)) + bb2_2 = bb2_2 + ab2 * conjg(u_o(3, 2, jb)) + bb1_3 = bb1_3 + ab1 * conjg(u_o(3, 3, jb)) + bb2_3 = bb2_3 + ab2 * conjg(u_o(3, 3, jb)) + + af1 = GAMMA_AF1(3) + af2 = GAMMA_AF2(3) + + bf1_1 = bf1_1 + af1 * u_e(1, 3, i) + bf2_1 = bf2_1 + af2 * u_e(1, 3, i) + bf1_2 = bf1_2 + af1 * u_e(2, 3, i) + bf2_2 = bf2_2 + af2 * u_e(2, 3, i) + bf1_3 = bf1_3 + af1 * u_e(3, 3, i) + bf2_3 = bf2_3 + af2 * u_e(3, 3, i) + + + UPDATE_B(1, 1) + UPDATE_B(2, 1) + UPDATE_B(3, 1) + UPDATE_B(4, 1) + + UPDATE_B(1, 2) + UPDATE_B(2, 2) + UPDATE_B(3, 2) + UPDATE_B(4, 2) + + UPDATE_B(1, 3) + UPDATE_B(2, 3) + UPDATE_B(3, 3) + UPDATE_B(4, 3) + + enddo + + !!TIMING_STOP(STRCAT(timing_bin_, NAME)) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D3xyzt.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D3xyzt.F90 new file mode 100644 index 0000000000000000000000000000000000000000..391e3189644197b1eeb0d29c3db87ed3d60d579f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D3xyzt.F90 @@ -0,0 +1,203 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! D3xyzt.F90 - routines needed in D3.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +#ifdef DIR_T + +#ifdef DAGGER +# define GAMMA_AB1(C) a(3, C, jb) +# define GAMMA_AB2(C) a(4, C, jb) +# define GAMMA_AF1(C) a(1, C, jf) +# define GAMMA_AF2(C) a(2, C, jf) +# define GAMMA_B1(C) bf1_ ## C +# define GAMMA_B2(C) bf2_ ## C +# define GAMMA_B3(C) bb1_ ## C +# define GAMMA_B4(C) bb2_ ## C +#else +# define GAMMA_AB1(C) a(1, C, jb) +# define GAMMA_AB2(C) a(2, C, jb) +# define GAMMA_AF1(C) a(3, C, jf) +# define GAMMA_AF2(C) a(4, C, jf) +# define GAMMA_B1(C) bb1_ ## C +# define GAMMA_B2(C) bb2_ ## C +# define GAMMA_B3(C) bf1_ ## C +# define GAMMA_B4(C) bf2_ ## C +#endif + +# define UPDATE_B(S, C) b(S, C, i) = b(S, C, i) + TWO * GAMMA_B ## S ## (C) + +#else + +#ifdef DAGGER +# define PLUS - +# define MINUS + +#else +# define PLUS + +# define MINUS - +#endif + +#ifdef DIR_X +# define GAMMA_AB1(C) a(1, C, jb) PLUS i_times(a(4, C, jb)) +# define GAMMA_AB2(C) a(2, C, jb) PLUS i_times(a(3, C, jb)) +# define GAMMA_AF1(C) a(1, C, jf) MINUS i_times(a(4, C, jf)) +# define GAMMA_AF2(C) a(2, C, jf) MINUS i_times(a(3, C, jf)) +# define GAMMA_B3(C) MINUS i_times(bb2_ ## C) PLUS i_times(bf2_ ## C) +# define GAMMA_B4(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C) + +# define UPDATE_B(S, C) b(S, C, i) = GAMMA_B ## S ## (C) +#endif + +#ifdef DIR_Y +# define GAMMA_AB1(C) a(1, C, jb) PLUS a(4, C, jb) +# define GAMMA_AB2(C) a(2, C, jb) MINUS a(3, C, jb) +# define GAMMA_AF1(C) a(1, C, jf) MINUS a(4, C, jf) +# define GAMMA_AF2(C) a(2, C, jf) PLUS a(3, C, jf) +# define GAMMA_B3(C) MINUS bb2_ ## C PLUS bf2_ ## C +# define GAMMA_B4(C) PLUS bb1_ ## C MINUS bf1_ ## C + +# define UPDATE_B(S, C) b(S, C, i) = b(S, C, i) GAMMA_B ## S ## (C) +#endif + +#ifdef DIR_Z +# define GAMMA_AB1(C) a(1, C, jb) PLUS i_times(a(3, C, jb)) +# define GAMMA_AB2(C) a(2, C, jb) MINUS i_times(a(4, C, jb)) +# define GAMMA_AF1(C) a(1, C, jf) MINUS i_times(a(3, C, jf)) +# define GAMMA_AF2(C) a(2, C, jf) PLUS i_times(a(4, C, jf)) +# define GAMMA_B3(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C) +# define GAMMA_B4(C) PLUS i_times(bb2_ ## C) MINUS i_times(bf2_ ## C) + +# define UPDATE_B(S, C) b(S, C, i) = b(S, C, i) GAMMA_B ## S ## (C) +#endif + +# define GAMMA_B1(C) + bb1_ ## C + bf1_ ## C +# define GAMMA_B2(C) + bb2_ ## C + bf2_ ## C + +#endif + +!------------------------------------------------------------------------------- +subroutine NAME(b, a, u_e, u_o, nn_fwd, nn_bwd, i1, i2) + + implicit none + + COMPLEX, dimension (NDIRAC, NCOL, *), intent(inout) :: b + COMPLEX, dimension (NDIRAC, NCOL, *), intent(in) :: a + COMPLEX, dimension (NCOL, NCOL, *), intent(in) :: u_e, u_o + INTEGER, dimension (*), intent(in) :: nn_fwd, nn_bwd + integer :: i1, i2 + + integer :: i, jf, jb + + COMPLEX :: ab1, ab2, af1, af2 + COMPLEX :: bf1_1, bf2_1 + COMPLEX :: bf1_2, bf2_2 + COMPLEX :: bf1_3, bf2_3 + COMPLEX :: bb1_1, bb2_1 + COMPLEX :: bb1_2, bb2_2 + COMPLEX :: bb1_3, bb2_3 + + ! statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + !!TIMING_START(STRCAT(timing_bin_, NAME)) + + do i = i1, i2 + jb = nn_bwd(i) + + ab1 = GAMMA_AB1(1) + ab2 = GAMMA_AB2(1) + + bb1_1 = ab1 * conjg(u_o(1, 1, jb)) + bb2_1 = ab2 * conjg(u_o(1, 1, jb)) + bb1_2 = ab1 * conjg(u_o(1, 2, jb)) + bb2_2 = ab2 * conjg(u_o(1, 2, jb)) + bb1_3 = ab1 * conjg(u_o(1, 3, jb)) + bb2_3 = ab2 * conjg(u_o(1, 3, jb)) + + jf = nn_fwd(i) + + af1 = GAMMA_AF1(1) + af2 = GAMMA_AF2(1) + + bf1_1 = af1 * u_e(1, 1, i) + bf2_1 = af2 * u_e(1, 1, i) + bf1_2 = af1 * u_e(2, 1, i) + bf2_2 = af2 * u_e(2, 1, i) + bf1_3 = af1 * u_e(3, 1, i) + bf2_3 = af2 * u_e(3, 1, i) + + ab1 = GAMMA_AB1(2) + ab2 = GAMMA_AB2(2) + + bb1_1 = bb1_1 + ab1 * conjg(u_o(2, 1, jb)) + bb2_1 = bb2_1 + ab2 * conjg(u_o(2, 1, jb)) + bb1_2 = bb1_2 + ab1 * conjg(u_o(2, 2, jb)) + bb2_2 = bb2_2 + ab2 * conjg(u_o(2, 2, jb)) + bb1_3 = bb1_3 + ab1 * conjg(u_o(2, 3, jb)) + bb2_3 = bb2_3 + ab2 * conjg(u_o(2, 3, jb)) + + af1 = GAMMA_AF1(2) + af2 = GAMMA_AF2(2) + + bf1_1 = bf1_1 + af1 * u_e(1, 2, i) + bf2_1 = bf2_1 + af2 * u_e(1, 2, i) + bf1_2 = bf1_2 + af1 * u_e(2, 2, i) + bf2_2 = bf2_2 + af2 * u_e(2, 2, i) + bf1_3 = bf1_3 + af1 * u_e(3, 2, i) + bf2_3 = bf2_3 + af2 * u_e(3, 2, i) + + ab1 = GAMMA_AB1(3) + ab2 = GAMMA_AB2(3) + + bb1_1 = bb1_1 + ab1 * conjg(u_o(3, 1, jb)) + bb2_1 = bb2_1 + ab2 * conjg(u_o(3, 1, jb)) + bb1_2 = bb1_2 + ab1 * conjg(u_o(3, 2, jb)) + bb2_2 = bb2_2 + ab2 * conjg(u_o(3, 2, jb)) + bb1_3 = bb1_3 + ab1 * conjg(u_o(3, 3, jb)) + bb2_3 = bb2_3 + ab2 * conjg(u_o(3, 3, jb)) + + af1 = GAMMA_AF1(3) + af2 = GAMMA_AF2(3) + + bf1_1 = bf1_1 + af1 * u_e(1, 3, i) + bf2_1 = bf2_1 + af2 * u_e(1, 3, i) + bf1_2 = bf1_2 + af1 * u_e(2, 3, i) + bf2_2 = bf2_2 + af2 * u_e(2, 3, i) + bf1_3 = bf1_3 + af1 * u_e(3, 3, i) + bf2_3 = bf2_3 + af2 * u_e(3, 3, i) + + + UPDATE_B(1, 1) + UPDATE_B(2, 1) + UPDATE_B(3, 1) + UPDATE_B(4, 1) + + UPDATE_B(1, 2) + UPDATE_B(2, 2) + UPDATE_B(3, 2) + UPDATE_B(4, 2) + + UPDATE_B(1, 3) + UPDATE_B(2, 3) + UPDATE_B(3, 3) + UPDATE_B(4, 3) + + enddo + + !!TIMING_STOP(STRCAT(timing_bin_, NAME)) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/DSFxyzt.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/DSFxyzt.F90 new file mode 100644 index 0000000000000000000000000000000000000000..23b0d6fb9600d4b9f4411d95e119babf591e61ea --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/DSFxyzt.F90 @@ -0,0 +1,108 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! DSFxyzt.F90 - routines (for standard Wilson fermions) needed in dsf.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +#ifdef DIR_X +# define GAMMA_A1(C) a(1, C, J) minus i_times(a(4, C, J)) +# define GAMMA_A2(C) a(2, C, J) minus i_times(a(3, C, J)) +# define GAMMA_A3(C) plus i_times(a2) +# define GAMMA_A4(C) plus i_times(a1) +#endif + +#ifdef DIR_Y +# define GAMMA_A1(C) a(1, C, J) minus a(4, C, J) +# define GAMMA_A2(C) a(2, C, J) plus a(3, C, J) +# define GAMMA_A3(C) plus a2 +# define GAMMA_A4(C) minus a1 +#endif + +#ifdef DIR_Z +# define GAMMA_A1(C) a(1, C, J) minus i_times(a(3, C, J)) +# define GAMMA_A2(C) a(2, C, J) plus i_times(a(4, C, J)) +# define GAMMA_A3(C) plus i_times(a1) +# define GAMMA_A4(C) minus i_times(a2) +#endif + +#ifdef DIR_T +#ifdef FORWARD +# define GAMMA_A1(C) ZERO +# define GAMMA_A2(C) ZERO +# define GAMMA_A3(C) TWO * a(3, C, J) +# define GAMMA_A4(C) TWO * a(4, C, J) +#else +# define GAMMA_A1(C) TWO * a(1, C, J) +# define GAMMA_A2(C) TWO * a(2, C, J) +# define GAMMA_A3(C) ZERO +# define GAMMA_A4(C) ZERO +#endif +#endif + +#ifdef FORWARD +# define UU(R, A, B) uu(R, A, B) +# define plus + +# define minus - +# define I i +# define J j +#else +# define UU(R, A, B) uud(R, B, A) +# define plus - +# define minus + +# define I j +# define J i +#endif + +!------------------------------------------------------------------------------- +subroutine NAME(p, b, a, s, u, nn, volh) + + implicit none + + REAL, dimension(NGEN, *), intent(inout) :: p + REAL, intent(in) :: s + COMPLEX, dimension (NDIRAC, NCOL, *), intent(in) :: b, a + COMPLEX, dimension (NCOL, NCOL, *), intent(in) :: u + INTEGER, intent(in) :: nn(*) + integer :: volh + + integer :: i, j, ca, cb + COMPLEX :: a1, a2, a3, a4 + SU3 :: v, w + + ! statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + + !$omp parallel do private(j,ca,cb,a1,a2,a3,a4,w,v) + do i = 1, volh + j = nn(i) + do ca = 1, NCOL + a1 = GAMMA_A1(ca) + a2 = GAMMA_A2(ca) + a3 = GAMMA_A3(ca) + a4 = GAMMA_A4(ca) + do cb = 1, NCOL + w(ca, cb) = a1 * conjg(b(1, cb, I)) & + + a2 * conjg(b(2, cb, I)) & + + a3 * conjg(b(3, cb, I)) & + + a4 * conjg(b(4, cb, I)) + enddo + enddo + call UU(v, u(1, 1, i), w) + call im_tr_j(p(1, i), v, minus s) + enddo + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/DVersion.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/DVersion.F90 new file mode 100644 index 0000000000000000000000000000000000000000..7748291d4c1e1e9861b486f687425a94c53a4ef2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/DVersion.F90 @@ -0,0 +1,20 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! DVersion.F90 +! +!------------------------------------------------------------------------------- + +integer function version_of_d() + implicit none + version_of_d = VERSION +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/Dt.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/Dt.F90 new file mode 100644 index 0000000000000000000000000000000000000000..28f6232e7ad1083562c3dca27fac0b0c49445739 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/Dt.F90 @@ -0,0 +1,154 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! Dt.F90 - routines needed in D.F90 (t-direction) +! +!------------------------------------------------------------------------------- +# include "defs.h" + +#ifdef DAGGER +# define GAMMA_A1(C) a(3, C, jb) +# define GAMMA_A2(C) a(4, C, jb) +# define GAMMA_A3(C) a(1, C, jf) +# define GAMMA_A4(C) a(2, C, jf) +# define GAMMA_B1(C) f1_ ## C +# define GAMMA_B2(C) f2_ ## C +# define GAMMA_B3(C) b1_ ## C +# define GAMMA_B4(C) b2_ ## C +#else +# define GAMMA_A1(C) a(1, C, jb) +# define GAMMA_A2(C) a(2, C, jb) +# define GAMMA_A3(C) a(3, C, jf) +# define GAMMA_A4(C) a(4, C, jf) +# define GAMMA_B1(C) b1_ ## C +# define GAMMA_B2(C) b2_ ## C +# define GAMMA_B3(C) f1_ ## C +# define GAMMA_B4(C) f2_ ## C +#endif + +!------------------------------------------------------------------------------- +subroutine NAME(b, a, u_e, u_o, nn_fwd, nn_bwd, volh) + + implicit none + + COMPLEX, dimension (NDIRAC, NCOL, *), intent(inout) :: b + COMPLEX, dimension (NDIRAC, NCOL, *), intent(in) :: a + COMPLEX, dimension (NCOL, NCOL, *), intent(in) :: u_e, u_o + INTEGER, dimension (*), intent(in) :: nn_fwd, nn_bwd + integer :: volh + + integer :: i, jf, jb + + COMPLEX :: a1, a2, a3, a4 + COMPLEX :: f1_1, f2_1 + COMPLEX :: f1_2, f2_2 + COMPLEX :: f1_3, f2_3 + COMPLEX :: b1_1, b2_1 + COMPLEX :: b1_2, b2_2 + COMPLEX :: b1_3, b2_3 + + ! statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + TIMING_START(STRCAT(timing_bin_, NAME)) + + !$omp parallel do private(jf, jb, a1, a2, a3, a4, & + !$omp f1_1, f2_1, f1_2, f2_2, f1_3, f2_3, & + !$omp b1_1, b2_1, b1_2, b2_2, b1_3, b2_3) + do i = 1, volh + jb = nn_bwd(i) + + a1 = GAMMA_A1(1) + a2 = GAMMA_A2(1) + + b1_1 = a1 * conjg(u_o(1, 1, jb)) + b2_1 = a2 * conjg(u_o(1, 1, jb)) + b1_2 = a1 * conjg(u_o(1, 2, jb)) + b2_2 = a2 * conjg(u_o(1, 2, jb)) + b1_3 = a1 * conjg(u_o(1, 3, jb)) + b2_3 = a2 * conjg(u_o(1, 3, jb)) + + jf = nn_fwd(i) + + a3 = GAMMA_A3(1) + a4 = GAMMA_A4(1) + + f1_1 = a3 * u_e(1, 1, i) + f2_1 = a4 * u_e(1, 1, i) + f1_2 = a3 * u_e(2, 1, i) + f2_2 = a4 * u_e(2, 1, i) + f1_3 = a3 * u_e(3, 1, i) + f2_3 = a4 * u_e(3, 1, i) + + a1 = GAMMA_A1(2) + a2 = GAMMA_A2(2) + + b1_1 = b1_1 + a1 * conjg(u_o(2, 1, jb)) + b2_1 = b2_1 + a2 * conjg(u_o(2, 1, jb)) + b1_2 = b1_2 + a1 * conjg(u_o(2, 2, jb)) + b2_2 = b2_2 + a2 * conjg(u_o(2, 2, jb)) + b1_3 = b1_3 + a1 * conjg(u_o(2, 3, jb)) + b2_3 = b2_3 + a2 * conjg(u_o(2, 3, jb)) + + a3 = GAMMA_A3(2) + a4 = GAMMA_A4(2) + + f1_1 = f1_1 + a3 * u_e(1, 2, i) + f2_1 = f2_1 + a4 * u_e(1, 2, i) + f1_2 = f1_2 + a3 * u_e(2, 2, i) + f2_2 = f2_2 + a4 * u_e(2, 2, i) + f1_3 = f1_3 + a3 * u_e(3, 2, i) + f2_3 = f2_3 + a4 * u_e(3, 2, i) + + a1 = GAMMA_A1(3) + a2 = GAMMA_A2(3) + + b1_1 = b1_1 + a1 * conjg(u_o(3, 1, jb)) + b2_1 = b2_1 + a2 * conjg(u_o(3, 1, jb)) + b1_2 = b1_2 + a1 * conjg(u_o(3, 2, jb)) + b2_2 = b2_2 + a2 * conjg(u_o(3, 2, jb)) + b1_3 = b1_3 + a1 * conjg(u_o(3, 3, jb)) + b2_3 = b2_3 + a2 * conjg(u_o(3, 3, jb)) + + a3 = GAMMA_A3(3) + a4 = GAMMA_A4(3) + + f1_1 = f1_1 + a3 * u_e(1, 3, i) + f2_1 = f2_1 + a4 * u_e(1, 3, i) + f1_2 = f1_2 + a3 * u_e(2, 3, i) + f2_2 = f2_2 + a4 * u_e(2, 3, i) + f1_3 = f1_3 + a3 * u_e(3, 3, i) + f2_3 = f2_3 + a4 * u_e(3, 3, i) + + + b(1, 1, i) = TWO * GAMMA_B1(1) + b(2, 1, i) = TWO * GAMMA_B2(1) + b(3, 1, i) = TWO * GAMMA_B3(1) + b(4, 1, i) = TWO * GAMMA_B4(1) + + b(1, 2, i) = TWO * GAMMA_B1(2) + b(2, 2, i) = TWO * GAMMA_B2(2) + b(3, 2, i) = TWO * GAMMA_B3(2) + b(4, 2, i) = TWO * GAMMA_B4(2) + + b(1, 3, i) = TWO * GAMMA_B1(3) + b(2, 3, i) = TWO * GAMMA_B2(3) + b(3, 3, i) = TWO * GAMMA_B3(3) + b(4, 3, i) = TWO * GAMMA_B4(3) + + enddo + + TIMING_STOP(STRCAT(timing_bin_, NAME)) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/Dxyz.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/Dxyz.F90 new file mode 100644 index 0000000000000000000000000000000000000000..a60d023a9327832b816e1a5eb2b0b7f6b282acd7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/Dxyz.F90 @@ -0,0 +1,137 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! Dxyz.F90 - routines needed in D.F90 (x/y/z-directions) +! +!------------------------------------------------------------------------------- +# include "defs.h" + +#ifdef DIR_X +# define GAMMA_A1(C) a(1, C, j) minus i_times(a(4, C, j)) +# define GAMMA_A2(C) a(2, C, j) minus i_times(a(3, C, j)) +# define GAMMA_B1(C) b(3, C, i) plus i_times(b2_ ## C) +# define GAMMA_B2(C) b(4, C, i) plus i_times(b1_ ## C) +#endif + +#ifdef DIR_Y +# define GAMMA_A1(C) a(1, C, j) minus a(4, C, j) +# define GAMMA_A2(C) a(2, C, j) plus a(3, C, j) +# define GAMMA_B1(C) b(3, C, i) plus b2_ ## C +# define GAMMA_B2(C) b(4, C, i) minus b1_ ## C +#endif + +#ifdef DIR_Z +# define GAMMA_A1(C) a(1, C, j) minus i_times(a(3, C, j)) +# define GAMMA_A2(C) a(2, C, j) plus i_times(a(4, C, j)) +# define GAMMA_B1(C) b(3, C, i) plus i_times(b1_ ## C) +# define GAMMA_B2(C) b(4, C, i) minus i_times(b2_ ## C) +#endif + +#ifdef FORWARD +# define U(A, B) u(A, B, i) +# define minus MINUS +# define plus PLUS +#else +# define U(A, B) conjg(u(B, A, j)) +# define minus PLUS +# define plus MINUS +#endif + +#ifdef DAGGER +# define PLUS - +# define MINUS + +#else +# define PLUS + +# define MINUS - +#endif + +!------------------------------------------------------------------------------- +subroutine NAME(b, a, u, nn, volh) + + implicit none + + COMPLEX, dimension (NDIRAC, NCOL, *), intent(inout) :: b + COMPLEX, dimension (NDIRAC, NCOL, *), intent(in) :: a + COMPLEX, dimension (NCOL, NCOL, *), intent(in) :: u + INTEGER, dimension (*), intent(in) :: nn + integer :: volh + + integer :: i, j + + COMPLEX :: a1, a2 + COMPLEX :: b1_1, b2_1 + COMPLEX :: b1_2, b2_2 + COMPLEX :: b1_3, b2_3 + + ! statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + TIMING_START(STRCAT(timing_bin_, NAME)) + + !$omp parallel do private(j, a1, a2, b1_1, b2_1, b1_2, b2_2, b1_3, b2_3) + do i = 1, volh + j = nn(i) + + a1 = GAMMA_A1(1) + a2 = GAMMA_A2(1) + + b1_1 = a1 * U(1, 1) + b2_1 = a2 * U(1, 1) + b1_2 = a1 * U(2, 1) + b2_2 = a2 * U(2, 1) + b1_3 = a1 * U(3, 1) + b2_3 = a2 * U(3, 1) + + a1 = GAMMA_A1(2) + a2 = GAMMA_A2(2) + + b1_1 = b1_1 + a1 * U(1, 2) + b2_1 = b2_1 + a2 * U(1, 2) + b1_2 = b1_2 + a1 * U(2, 2) + b2_2 = b2_2 + a2 * U(2, 2) + b1_3 = b1_3 + a1 * U(3, 2) + b2_3 = b2_3 + a2 * U(3, 2) + + a1 = GAMMA_A1(3) + a2 = GAMMA_A2(3) + + b1_1 = b1_1 + a1 * U(1, 3) + b2_1 = b2_1 + a2 * U(1, 3) + + b(1, 1, i) = b(1, 1, i) + b1_1 + b(2, 1, i) = b(2, 1, i) + b2_1 + b(3, 1, i) = GAMMA_B1(1) + b(4, 1, i) = GAMMA_B2(1) + + b1_2 = b1_2 + a1 * U(2, 3) + b2_2 = b2_2 + a2 * U(2, 3) + + b(1, 2, i) = b(1, 2, i) + b1_2 + b(2, 2, i) = b(2, 2, i) + b2_2 + b(3, 2, i) = GAMMA_B1(2) + b(4, 2, i) = GAMMA_B2(2) + + b1_3 = b1_3 + a1 * U(3, 3) + b2_3 = b2_3 + a2 * U(3, 3) + + b(1, 3, i) = b(1, 3, i) + b1_3 + b(2, 3, i) = b(2, 3, i) + b2_3 + b(3, 3, i) = GAMMA_B1(3) + b(4, 3, i) = GAMMA_B2(3) + + enddo + + TIMING_STOP(STRCAT(timing_bin_, NAME)) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_A/d/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..64a19ab5e5d6b7c1cd34db7eab15e53845cb71be --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/Makefile @@ -0,0 +1,335 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +# +#------------------------------------------------------------------------------- +# +# d/Makefile +# +#=============================================================================== + +include ../Makefile.defs + +fpp = $(FPP) -I.. $(FPPFLAGS) + +MODULES_DIR = ../modules + +.SUFFIXES: +.SUFFIXES: .a .o .f90 .F90 + +.f90.o: + $(F90) -c $(FFLAGS) -I$(MODULES_DIR) $< + +OBJS_D = \ + d.o \ + d_t.o \ + d_zb.o \ + d_zf.o \ + d_yb.o \ + d_yf.o \ + d_xb.o \ + d_xf.o \ + d_dag.o \ + d_dag_t.o \ + d_dag_zb.o \ + d_dag_zf.o \ + d_dag_yb.o \ + d_dag_yf.o \ + d_dag_xb.o \ + d_dag_xf.o \ + d_version.o + +OBJS_D2 = \ + d2.o \ + d2_t.o \ + d2_zf.o \ + d2_yf.o \ + d2_xf.o \ + d2_dag.o \ + d2_dag_t.o \ + d2_dag_zf.o \ + d2_dag_yf.o \ + d2_dag_xf.o \ + d2_version.o + +OBJS_D21 = \ + d21.o \ + d21_t.o \ + d21_zf.o \ + d21_yf.o \ + d21_xf.o \ + d21_dag.o \ + d21_dag_t.o \ + d21_dag_zf.o \ + d21_dag_yf.o \ + d21_dag_xf.o \ + d21_version.o + +OBJS_D3 = \ + d3.o \ + d3_t.o \ + d3_zf.o \ + d3_yf.o \ + d3_xf.o \ + d3_dag.o \ + d3_dag_t.o \ + d3_dag_zf.o \ + d3_dag_yf.o \ + d3_dag_xf.o \ + d3_version.o + +OBJS_D31 = \ + d31.o \ + d31_switch.o \ + d31_switch_0.o \ + d31_t.o \ + d31_t_0.o \ + d31_zf.o \ + d31_zf_0.o \ + d31_yf.o \ + d31_yf_0.o \ + d31_xf.o \ + d31_xf_0.o \ + d31_dag.o \ + d31_dag_switch.o \ + d31_dag_switch_0.o \ + d31_dag_t.o \ + d31_dag_t_0.o \ + d31_dag_zf.o \ + d31_dag_zf_0.o \ + d31_dag_yf.o \ + d31_dag_yf_0.o \ + d31_dag_xf.o \ + d31_dag_xf_0.o \ + d31_version.o + +OBJS_DSF = \ + dsf_xf.o \ + dsf_xb.o \ + dsf_yf.o \ + dsf_yb.o \ + dsf_zf.o \ + dsf_zb.o \ + dsf_tf.o \ + dsf_tb.o + + +#------------------------------------------------------------------------------- +$(LIBD): + +libd.a: $(OBJS_D) $(OBJS_DSF) + $(AR) $(ARFLAGS) $@ $(OBJS_D) $(OBJS_DSF) + $(RANLIB) $@ + +libd2.a: $(OBJS_D2) $(OBJS_DSF) + $(AR) $(ARFLAGS) $@ $(OBJS_D2) $(OBJS_DSF) + $(RANLIB) $@ + +libd21.a: $(OBJS_D21) $(OBJS_DSF) + $(AR) $(ARFLAGS) $@ $(OBJS_D21) $(OBJS_DSF) + $(RANLIB) $@ + +libd3.a: $(OBJS_D3) $(OBJS_DSF) + $(AR) $(ARFLAGS) $@ $(OBJS_D3) $(OBJS_DSF) + $(RANLIB) $@ + +libd31.a: $(OBJS_D31) $(OBJS_DSF) + $(AR) $(ARFLAGS) $@ $(OBJS_D31) $(OBJS_DSF) + $(RANLIB) $@ + +fast: + $(FAST_MAKE) + +#------------------------------------------------------------------------------- +d.f90: D.F90 $(DEPENDENCIES_D) + $(fpp) -DNAME=d D.F90 > $@ + +d_dag.f90: D.F90 $(DEPENDENCIES_D) + $(fpp) -DNAME=d_dag D.F90 > $@ + +d_xf.f90: Dxyz.F90 + $(fpp) -DNAME=d_xf -DDIR_X -DFORWARD -UDAGGER Dxyz.F90 > $@ + +d_yf.f90: Dxyz.F90 + $(fpp) -DNAME=d_yf -DDIR_Y -DFORWARD -UDAGGER Dxyz.F90 > $@ + +d_zf.f90: Dxyz.F90 + $(fpp) -DNAME=d_zf -DDIR_Z -DFORWARD -UDAGGER Dxyz.F90 > $@ + +d_dag_xf.f90: Dxyz.F90 + $(fpp) -DNAME=d_dag_xf -DDIR_X -DFORWARD -DDAGGER Dxyz.F90 > $@ + +d_dag_yf.f90: Dxyz.F90 + $(fpp) -DNAME=d_dag_yf -DDIR_Y -DFORWARD -DDAGGER Dxyz.F90 > $@ + +d_dag_zf.f90: Dxyz.F90 + $(fpp) -DNAME=d_dag_zf -DDIR_Z -DFORWARD -DDAGGER Dxyz.F90 > $@ + +d_xb.f90: Dxyz.F90 + $(fpp) -DNAME=d_xb -DDIR_X -UFORWARD -UDAGGER Dxyz.F90 > $@ + +d_yb.f90: Dxyz.F90 + $(fpp) -DNAME=d_yb -DDIR_Y -UFORWARD -UDAGGER Dxyz.F90 > $@ + +d_zb.f90: Dxyz.F90 + $(fpp) -DNAME=d_zb -DDIR_Z -UFORWARD -UDAGGER Dxyz.F90 > $@ + +d_dag_xb.f90: Dxyz.F90 + $(fpp) -DNAME=d_dag_xb -DDIR_X -UFORWARD -DDAGGER Dxyz.F90 > $@ + +d_dag_yb.f90: Dxyz.F90 + $(fpp) -DNAME=d_dag_yb -DDIR_Y -UFORWARD -DDAGGER Dxyz.F90 > $@ + +d_dag_zb.f90: Dxyz.F90 + $(fpp) -DNAME=d_dag_zb -DDIR_Z -UFORWARD -DDAGGER Dxyz.F90 > $@ + +d_t.f90: Dt.F90 + $(fpp) -DNAME=d_t -UDAGGER Dt.F90 > $@ + +d_dag_t.f90: Dt.F90 + $(fpp) -DNAME=d_dag_t -DDAGGER Dt.F90 > $@ + +d_version.f90: DVersion.F90 + $(fpp) -DVERSION=1 DVersion.F90 > $@ + + +#------------------------------------------------------------------------------- +d2.f90: D2.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d D2.F90 > $@ + +d2_dag.f90: D2.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d_dag D2.F90 > $@ + +d2_xf.f90: D2xyzt.F90; $(fpp) -DNAME=d_xf -DDIR_X -UDAGGER D2xyzt.F90 > $@ + +d2_yf.f90: D2xyzt.F90; $(fpp) -DNAME=d_yf -DDIR_Y -UDAGGER D2xyzt.F90 > $@ + +d2_zf.f90: D2xyzt.F90; $(fpp) -DNAME=d_zf -DDIR_Z -UDAGGER D2xyzt.F90 > $@ + +d2_dag_xf.f90: D2xyzt.F90; $(fpp) -DNAME=d_dag_xf -DDIR_X -DDAGGER D2xyzt.F90 > $@ + +d2_dag_yf.f90: D2xyzt.F90; $(fpp) -DNAME=d_dag_yf -DDIR_Y -DDAGGER D2xyzt.F90 > $@ + +d2_dag_zf.f90: D2xyzt.F90; $(fpp) -DNAME=d_dag_zf -DDIR_Z -DDAGGER D2xyzt.F90 > $@ + +d2_t.f90: D2xyzt.F90; $(fpp) -DNAME=d_t -DDIR_T -UDAGGER D2xyzt.F90 > $@ + +d2_dag_t.f90: D2xyzt.F90; $(fpp) -DNAME=d_dag_t -DDIR_T -DDAGGER D2xyzt.F90 > $@ + +d2_version.f90: DVersion.F90; $(fpp) -DVERSION=2 DVersion.F90 > $@ + + +#------------------------------------------------------------------------------- +d21.f90: D21.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d -UDAGGER D21.F90 > $@ + +d21_dag.f90: D21.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d_dag -DDAGGER D21.F90 > $@ + +d21_xf.f90: D21xyzt.F90; $(fpp) -DNAME=d_xf -DDIR_X -UDAGGER D21xyzt.F90 > $@ + +d21_yf.f90: D21xyzt.F90; $(fpp) -DNAME=d_yf -DDIR_Y -UDAGGER D21xyzt.F90 > $@ + +d21_zf.f90: D21xyzt.F90; $(fpp) -DNAME=d_zf -DDIR_Z -UDAGGER D21xyzt.F90 > $@ + +d21_dag_xf.f90: D21xyzt.F90; $(fpp) -DNAME=d_dag_xf -DDIR_X -DDAGGER D21xyzt.F90 > $@ + +d21_dag_yf.f90: D21xyzt.F90; $(fpp) -DNAME=d_dag_yf -DDIR_Y -DDAGGER D21xyzt.F90 > $@ + +d21_dag_zf.f90: D21xyzt.F90; $(fpp) -DNAME=d_dag_zf -DDIR_Z -DDAGGER D21xyzt.F90 > $@ + +d21_t.f90: D21xyzt.F90; $(fpp) -DNAME=d_t -DDIR_T -UDAGGER D21xyzt.F90 > $@ + +d21_dag_t.f90: D21xyzt.F90; $(fpp) -DNAME=d_dag_t -DDIR_T -DDAGGER D21xyzt.F90 > $@ + +d21_version.f90: DVersion.F90; $(fpp) -DVERSION=21 DVersion.F90 > $@ + + +#------------------------------------------------------------------------------- +d3.f90: D3.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d D3.F90 > $@ + +d3_dag.f90: D3.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d_dag D3.F90 > $@ + +d3_xf.f90: D3xyzt.F90; $(fpp) -DNAME=d_xf -DDIR_X -UDAGGER D3xyzt.F90 > $@ + +d3_yf.f90: D3xyzt.F90; $(fpp) -DNAME=d_yf -DDIR_Y -UDAGGER D3xyzt.F90 > $@ + +d3_zf.f90: D3xyzt.F90; $(fpp) -DNAME=d_zf -DDIR_Z -UDAGGER D3xyzt.F90 > $@ + +d3_dag_xf.f90: D3xyzt.F90; $(fpp) -DNAME=d_dag_xf -DDIR_X -DDAGGER D3xyzt.F90 > $@ + +d3_dag_yf.f90: D3xyzt.F90; $(fpp) -DNAME=d_dag_yf -DDIR_Y -DDAGGER D3xyzt.F90 > $@ + +d3_dag_zf.f90: D3xyzt.F90; $(fpp) -DNAME=d_dag_zf -DDIR_Z -DDAGGER D3xyzt.F90 > $@ + +d3_t.f90: D3xyzt.F90; $(fpp) -DNAME=d_t -DDIR_T -UDAGGER D3xyzt.F90 > $@ + +d3_dag_t.f90: D3xyzt.F90; $(fpp) -DNAME=d_dag_t -DDIR_T -DDAGGER D3xyzt.F90 > $@ + +d3_version.f90: DVersion.F90; $(fpp) -DVERSION=3 DVersion.F90 > $@ + + +#------------------------------------------------------------------------------- +d31.f90: D31.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d D31.F90 > $@ + +d31_dag.f90: D31.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d_dag D31.F90 > $@ + +d31_xf.f90: D31xyzt.F90; $(fpp) -DNAME=d_xf -DDIR_X -UDAGGER -UINIT D31xyzt.F90 > $@ +d31_xf_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_xf -DDIR_X -UDAGGER -DINIT D31xyzt.F90 > $@ + +d31_yf.f90: D31xyzt.F90; $(fpp) -DNAME=d_yf -DDIR_Y -UDAGGER -UINIT D31xyzt.F90 > $@ +d31_yf_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_yf -DDIR_Y -UDAGGER -DINIT D31xyzt.F90 > $@ + +d31_zf.f90: D31xyzt.F90; $(fpp) -DNAME=d_zf -DDIR_Z -UDAGGER -UINIT D31xyzt.F90 > $@ +d31_zf_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_zf -DDIR_Z -UDAGGER -DINIT D31xyzt.F90 > $@ + +d31_dag_xf.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_xf -DDIR_X -DDAGGER -UINIT D31xyzt.F90 > $@ +d31_dag_xf_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_xf -DDIR_X -DDAGGER -DINIT D31xyzt.F90 > $@ + +d31_dag_yf.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_yf -DDIR_Y -DDAGGER -UINIT D31xyzt.F90 > $@ +d31_dag_yf_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_yf -DDIR_Y -DDAGGER -DINIT D31xyzt.F90 > $@ + +d31_dag_zf.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_zf -DDIR_Z -DDAGGER -UINIT D31xyzt.F90 > $@ +d31_dag_zf_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_zf -DDIR_Z -DDAGGER -DINIT D31xyzt.F90 > $@ + +d31_t.f90: D31xyzt.F90; $(fpp) -DNAME=d_t -DDIR_T -UDAGGER -UINIT D31xyzt.F90 > $@ +d31_t_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_t -DDIR_T -UDAGGER -DINIT D31xyzt.F90 > $@ + +d31_dag_t.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_t -DDIR_T -DDAGGER -UINIT D31xyzt.F90 > $@ +d31_dag_t_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_t -DDIR_T -DDAGGER -DINIT D31xyzt.F90 > $@ + +d31_version.f90: DVersion.F90; $(fpp) -DVERSION=31 DVersion.F90 > $@ + +d31_switch.f90: D31_switch.F90; $(fpp) -DNAME=d -UINIT D31_switch.F90 > $@ +d31_switch_0.f90: D31_switch.F90; $(fpp) -DNAME=d -DINIT D31_switch.F90 > $@ + +d31_dag_switch.f90: D31_switch.F90; $(fpp) -DNAME=d_dag -UINIT D31_switch.F90 > $@ +d31_dag_switch_0.f90: D31_switch.F90; $(fpp) -DNAME=d_dag -DINIT D31_switch.F90 > $@ + + +#------------------------------------------------------------------------------- +dsf.f90: dsf.F90 $(DEPENDENCIES_DSF); $(fpp) dsf.F90 > $@ + +dsf_xf.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_xf -DDIR_X -DFORWARD DSFxyzt.F90 > $@ + +dsf_yf.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_yf -DDIR_Y -DFORWARD DSFxyzt.F90 > $@ + +dsf_zf.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_zf -DDIR_Z -DFORWARD DSFxyzt.F90 > $@ + +dsf_tf.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_tf -DDIR_T -DFORWARD DSFxyzt.F90 > $@ + +dsf_xb.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_xb -DDIR_X -UFORWARD DSFxyzt.F90 > $@ + +dsf_yb.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_yb -DDIR_Y -UFORWARD DSFxyzt.F90 > $@ + +dsf_zb.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_zb -DDIR_Z -UFORWARD DSFxyzt.F90 > $@ + +dsf_tb.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_tb -DDIR_T -UFORWARD DSFxyzt.F90 > $@ + + +#------------------------------------------------------------------------------- +clean: + rm -f *.[Tiod] *.f90 *.mod work.pc work.pcl + +clobber: clean + rm -f libd.a libd2.a libd21.a libd3.a libd31.a diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/bqcd.pcl b/qcd/part_cpu/applications/QCD/src/kernel_A/d/bqcd.pcl new file mode 100644 index 0000000000000000000000000000000000000000..906244500b31700684482c3dcfd32f6cec4279db --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/bqcd.pcl @@ -0,0 +1,2 @@ +work.pc +../modules/work.pc diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/defs.h b/qcd/part_cpu/applications/QCD/src/kernel_A/defs.h new file mode 100644 index 0000000000000000000000000000000000000000..05654d6e3e338d85e7aa8fc50ccbf893b7e1d222 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/defs.h @@ -0,0 +1,213 @@ +#ifndef BQCD_DEFS_H +#define BQCD_DEFS_H + +# define MAX_TEMPER 50 + +# define RKIND 8 + +# define BQCD_REAL mpi_real8 + +# define BQCD_CHECK_SUM mpi_integer8 +# define BQCD_SEED mpi_integer8 + +# define CHECK_SUM integer(8) +# define SEED integer(8) +# define SECONDS real(8) +# define COMM_METHOD character(40) + +#ifdef INTEL +# define RECL_UNIT 4 +#else +# define RECL_UNIT 1 +#endif + +# define DIM 4 +# define NCOL 3 +# define NDIRAC 4 +# define NGEN 8 +# define EVEN 0 +# define ODD 1 +# define FWD 0 +# define BWD 1 + +# define SIZE_COMPLEX 2 + +# define REAL real(RKIND) +# define INTEGER integer(4) +# define COMPLEX complex(RKIND) + +# define SU3 COMPLEX, dimension (NCOL, NCOL) +# define GENERATOR REAL, dimension (NGEN) + +# define GAUGE_FIELD_IO COMPLEX, dimension(NCOL, NCOL-1, DIM, 0:NX-1, 0:NY-1, 0:NZ-1, 0:NT-1) +# define SPINCOL_FIELD_IO COMPLEX, dimension(NDIRAC, NCOL, 0:NXH-1, 0:NY-1, 0:NZ-1, 0:NT-1) + +# define SU3_FIELD COMPLEX, dimension (NCOL, NCOL, volh_tot) +# define GAUGE_FIELD COMPLEX, dimension (NCOL, NCOL, volh_tot, EVEN:ODD, DIM) +# define GENERATOR_FIELD REAL, dimension (NGEN, volh_tot, EVEN:ODD, DIM) +# define SPINCOL_FIELD COMPLEX, dimension (NDIRAC, NCOL, volh_tot) +# define SC2_FIELD COMPLEX, dimension(2, NCOL, volh_tot, DIM, FWD:BWD) +# define CLOVER_FIELD_A type(type_clover_a), dimension(2, volh, EVEN:ODD) +# define CLOVER_FIELD_B type(type_clover_b), dimension(2, volh, EVEN:ODD) +# define CLOVER_FIELD_C COMPLEX, dimension(NDIRAC, NCOL, NDIRAC, NCOL, volh) + +# define P_GAUGE_FIELD COMPLEX, dimension(:, :, :, :, :), pointer +# define P_GAUGE_FIELD_IO COMPLEX, dimension(:, :, :, :, :, :, :), pointer +# define P_GENERATOR_FIELD REAL, dimension(:, :, :, :), pointer +# define P_SPINCOL_FIELD COMPLEX, dimension(:, :, :), pointer +# define P_SPINCOL_FIELD_IO COMPLEX, dimension(:, :, :, :, :, :), pointer +# define P_SC2_FIELD COMPLEX, dimension(:, :, :, :, :), pointer +# define P_CLOVER_FIELD_A type(type_clover_a), dimension(:, :, :), pointer +# define P_CLOVER_FIELD_B type(type_clover_b), dimension(:, :, :), pointer + +# define SPINCOL_OVERINDEXED REAL, dimension(SIZE_COMPLEX*NDIRAC*NCOL*volh_tot) +# define P_SPINCOL_OVERINDEXED REAL, dimension(:), pointer + +# define FILENAME character(len=80) +# define FILENAME_FORMAT character(len=80) + +# define Re(z) real(z) +# define Im(z) aimag(z) + +# define CAT(A, B) A ## B +# define STRCAT(A, B) CAT(A, B) +# define STRCAT3(A, B, C) STRCAT(STRCAT(A, B), C) + +# define PI STRCAT(3.1415926535897931_, RKIND) +# define TWOPI STRCAT(6.2831853071795862_, RKIND) +# define SQRT3 STRCAT(1.7320508075688772_, RKIND) + +# define ZERO STRCAT(0.0_, RKIND) +# define ONE STRCAT(1.0_, RKIND) +# define TWO STRCAT(2.0_, RKIND) +# define THREE STRCAT(3.0_, RKIND) +# define FOUR STRCAT(4.0_, RKIND) +# define SIX STRCAT(6.0_, RKIND) +# define EIGHT STRCAT(8.0_, RKIND) + +# define HALF STRCAT(0.5_, RKIND) +# define EIGHTH STRCAT(0.125_, RKIND) + +# define timing_bin_d_xf 1 +# define timing_bin_d_xb 2 +# define timing_bin_d_yf 3 +# define timing_bin_d_yb 4 +# define timing_bin_d_zf 5 +# define timing_bin_d_zb 6 +# define timing_bin_d_t 7 +# define timing_bin_d 8 +# define timing_bin_mtdagmt 9 +# define timing_bin_global_sum 10 +# define timing_bin_global_sum_vec 11 +# define timing_bin_sc_zero 12 +# define timing_bin_sc_copy 13 +# define timing_bin_sc_scale 14 +# define timing_bin_sc_norm2 15 +# define timing_bin_sc_dot 16 +# define timing_bin_sc_axpy 17 +# define timing_bin_sc_xpby 18 +# define timing_bin_sc_axpby 19 +# define timing_bin_sc_cdotc 20 +# define timing_bin_sc_caxpy 21 +# define timing_bin_sc_caxpy2 22 +# define timing_bin_sc_cax2 23 +# define timing_bin_cg 24 +# define timing_bin_hmc_init_p 25 +# define timing_bin_hmc_u 26 +# define timing_bin_dsg 27 +# define timing_bin_dsf 28 +# define timing_bin_clover_init 29 +# define timing_bin_clover_mult_a 30 +# define timing_bin_clover_mult_ao 31 +# define timing_bin_clover_mult_b 32 +# define timing_bin_clover_dsd 33 +# define timing_bin_clover_dsf 34 +# define timing_bin_hmc 35 +# define timing_bin_plaq 36 +# define timing_bin_cooling 37 +# define timing_bin_u_read 38 +# define timing_bin_u_write 39 +# define timing_bin_total 40 + +# define timing_bin_hmc_init 41 +# define timing_bin_hmc_momenta 42 +# define timing_bin_hmc_init_phi 43 +# define timing_bin_hmc_h_old 44 +# define timing_bin_hmc_backup 45 +# define timing_bin_hmc_half_step0 46 +# define timing_bin_hmc_half_step1 47 +# define timing_bin_hmc_xbound_g 48 +# define timing_bin_hmc_steps 49 +# define timing_bin_hmc_h_new 50 +# define timing_bin_hmc_rest 51 + +# define timing_bin_h_mult_a 52 +# define timing_bin_h_mult_b 53 +# define timing_bin_h_mult_c 54 + +# define timing_bin_sc2_projection 55 + +# define timing_bin_d_dag_xf timing_bin_d_xf +# define timing_bin_d_dag_xb timing_bin_d_xb +# define timing_bin_d_dag_yf timing_bin_d_yf +# define timing_bin_d_dag_yb timing_bin_d_yb +# define timing_bin_d_dag_zf timing_bin_d_zf +# define timing_bin_d_dag_zb timing_bin_d_zb +# define timing_bin_d_dag_t timing_bin_d_t +# define timing_bin_d_dag timing_bin_d + +#ifdef TIMING + +# define TIMING_START(bin) call timing_start(bin) +# define TIMING_STOP(bin) call timing_stop(bin) +# define TIMING_WRITE(unit) call timing_write(unit) + +#else + +# define TIMING_START(bin) +# define TIMING_STOP(bin) +# define TIMING_WRITE(unit) + +#endif + + +# define STDERR 0 +# define UINPUT 1 +# define UCONF 2 +# define URAN 3 +# define UCOUNT 4 +# define UREC 6 +# define UINFO 7 +# define ULIST 8 +# define UDIAG 99 + +# define START_HOT 0 +# define START_COLD 1 +# define START_CONT 2 +# define START_FILE 3 + +# define SWAP_DOWN -1 +# define SWAP_RANDOM 0 +# define SWAP_UP 1 + +# define HMC_TEST_FORWARDS 1 +# define HMC_TEST_NONE 0 +# define HMC_TEST_BACKWARDS -1 + +# define PUTSTR(unit, str) if (my_pe() == 0) write(unit,*) str +# define PUTVAL(unit, val) if (my_pe() == 0) write(unit,*) #val, ": ", val + +# define DIAGSTR(str) write(UDIAG,*) str +# define DIAGVAL(val) write(UDIAG,*) #val, ": ", val + +# define ALLOCATE_G_FIELD(x) if (.not. associated(x)) call allocate_g_field(x) +# define ALLOCATE_G_FIELD_IO(x) if (.not. associated(x)) call allocate_g_field_io(x) +# define ALLOCATE_GEN_FIELD(x) if (.not. associated(x)) call allocate_gen_field(x) +# define ALLOCATE_SC_FIELD(x) if (.not. associated(x)) call allocate_sc_field(x) +# define ALLOCATE_SC_FIELD_IO(x) if (.not. associated(x)) call allocate_sc_field_io(x) +# define ALLOCATE_SC_OVERINDEXED(x) if (.not. associated(x)) call allocate_sc_overindexed(x) +# define ALLOCATE_SC2_FIELD(x) if (.not. associated(x)) call allocate_sc2_field(x) + +# define ASSERT(condition) if (.not. (condition)) call assertion_failed(__FILE__, __LINE__, #condition) + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/dsd.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/dsd.F90 new file mode 100644 index 0000000000000000000000000000000000000000..0a5b9069a3cc78918cbbc1e6307cf0a25f94731c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/dsd.F90 @@ -0,0 +1,40 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! dsd.F90 --- p(j,x,mu) := p(j,x,mu) - step * D_{x,mu,j} S_det +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine dsd(p, conf, step, para) + + use typedef_hmc + use module_hmc_forces + use module_vol + implicit none + + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(in) :: conf + GENERATOR_FIELD, intent(inout) :: p + REAL, intent(in) :: step + REAL :: s + + s = -step * TWO * (para%csw_kappa / EIGHT) + + if (s /= ZERO) then + call hmc_forces_old(p) + call clover_dsd(ODD, p, conf%b, s, conf%u) + call hmc_forces_new(p, step, i_sd) + endif + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/dsf.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/dsf.F90 new file mode 100644 index 0000000000000000000000000000000000000000..f06ff31df11deaa345d7c5a8895e3eb1fa9f8378 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/dsf.F90 @@ -0,0 +1,101 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! dsf.F90 - kernel of: p(j,x,mu) := p(j,x,mu) - step * D_{x,mu,j} S_{f 1|2} +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine dsf(p, conf, step, para, a, b) + + use typedef_hmc + use module_nn + use module_p_interface + use module_vol + implicit none + + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(in) :: conf + GENERATOR_FIELD, intent(inout) :: p + REAL, intent(in) :: step + SPINCOL_FIELD, intent(in) :: a + SPINCOL_FIELD, intent(in) :: b + + P_GAUGE_FIELD :: u + P_SPINCOL_FIELD, save :: at, bt + REAL :: s, s1, s2 + + + !! call flip_bc(u) <- done in calling routine + + ALLOCATE_SC_FIELD(at) + ALLOCATE_SC_FIELD(bt) + + u => conf%u + + if (para%kappa /= ZERO) then + call d(ODD, EVEN, at, a, u) ! A~ = Doe A + call d_dag(ODD, EVEN, bt, b, u) ! B~ = Deo+ B + + if (para%csw_kappa /= ZERO) then + call clover_mult_b(conf%b(1,1,ODD), at, volh) ! A~ = inv(Too) A~ + call clover_mult_b(conf%b(1,1,ODD), bt, volh) ! B~ = inv(Too) B~ + endif + + if (para%h /= ZERO) then + call h_mult_b(-para%h, at, volh) ! A~ ~ inv(H) A~ + call h_mult_b( para%h, bt, volh) ! B~ ~ inv(H+) B~ + endif + + call xbound_sc_field(a) + call xbound_sc_field(b) + call xbound_sc_field(at) + call xbound_sc_field(bt) + endif + + TIMING_START(timing_bin_dsf) + + s = -step * TWO * para%kappa**2 / (ONE + para%h**2) + + if (s /= ZERO) then + call dsf_xf(p(1,1,EVEN,1), b, at, s, u(1,1,1,EVEN,1), nn(1,EVEN,1,FWD), VOLH) + call dsf_xf(p(1,1,ODD ,1), bt, a, s, u(1,1,1,ODD ,1), nn(1,ODD ,1,FWD), VOLH) + call dsf_xb(p(1,1,EVEN,1), bt, a, s, u(1,1,1,EVEN,1), nn(1,EVEN,1,FWD), VOLH) + call dsf_xb(p(1,1,ODD ,1), b, at, s, u(1,1,1,ODD ,1), nn(1,ODD ,1,FWD), VOLH) + + call dsf_yf(p(1,1,EVEN,2), b, at, s, u(1,1,1,EVEN,2), nn(1,EVEN,2,FWD), VOLH) + call dsf_yf(p(1,1,ODD ,2), bt, a, s, u(1,1,1,ODD ,2), nn(1,ODD ,2,FWD), VOLH) + call dsf_yb(p(1,1,EVEN,2), bt, a, s, u(1,1,1,EVEN,2), nn(1,EVEN,2,FWD), VOLH) + call dsf_yb(p(1,1,ODD ,2), b, at, s, u(1,1,1,ODD ,2), nn(1,ODD ,2,FWD), VOLH) + + call dsf_zf(p(1,1,EVEN,3), b, at, s, u(1,1,1,EVEN,3), nn(1,EVEN,3,FWD), VOLH) + call dsf_zf(p(1,1,ODD ,3), bt, a, s, u(1,1,1,ODD ,3), nn(1,ODD ,3,FWD), VOLH) + call dsf_zb(p(1,1,EVEN,3), bt, a, s, u(1,1,1,EVEN,3), nn(1,EVEN,3,FWD), VOLH) + call dsf_zb(p(1,1,ODD ,3), b, at, s, u(1,1,1,ODD ,3), nn(1,ODD ,3,FWD), VOLH) + + call dsf_tf(p(1,1,EVEN,4), b, at, s, u(1,1,1,EVEN,4), nn(1,EVEN,4,FWD), VOLH) + call dsf_tf(p(1,1,ODD ,4), bt, a, s, u(1,1,1,ODD ,4), nn(1,ODD ,4,FWD), VOLH) + call dsf_tb(p(1,1,EVEN,4), bt, a, s, u(1,1,1,EVEN,4), nn(1,EVEN,4,FWD), VOLH) + call dsf_tb(p(1,1,ODD ,4), b, at, s, u(1,1,1,ODD ,4), nn(1,ODD ,4,FWD), VOLH) + endif + + TIMING_STOP(timing_bin_dsf) + + call flip_bc(u) + + s1 = -step * TWO * (para%csw_kappa / EIGHT) + s2 = -step * TWO * (para%csw_kappa / EIGHT) * para%kappa**2 + + if (s1 /= ZERO) call clover_dsf(EVEN, p, b, a, s1, u) + if (s2 /= ZERO) call clover_dsf(ODD, p, bt, at, s2, u) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/dsf1.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/dsf1.F90 new file mode 100644 index 0000000000000000000000000000000000000000..9796d065359348494c2d70385a5c3f5caa63ae69 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/dsf1.F90 @@ -0,0 +1,66 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! dsf1.F90 -- p(j,x,mu) := p(j,x,mu) - step * D_{x,mu,j} S_{f1} +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine dsf1(p, conf, step, calc_sf, sf, para) + + use typedef_hmc + use module_hmc_forces + use module_mre + use module_p_interface + use module_switches + use module_vol + implicit none + + GENERATOR_FIELD, intent(inout) :: p + type(hmc_conf), intent(in) :: conf + type(hmc_para), intent(in) :: para + integer, intent(in) :: calc_sf + REAL, intent(in) :: step + REAL, intent(out) :: sf + + type(type_mre), save :: solutions + P_SPINCOL_FIELD, save :: a, b + REAL, external :: dotprod + integer :: iterations + external :: w_mult + external :: w_dagger_w + + sf = ZERO + + if (switches%quenched) return + + ALLOCATE_SC_FIELD(a) + ALLOCATE_SC_FIELD(b) + + call flip_bc(conf%u) + + call mre_get(solutions, w_mult, a, conf%phi, para, conf) + call cg(w_dagger_w, a, conf%phi, para, conf, iterations) ! A = inv(W+ W~) Phi + call mre_put(solutions, a, calc_sf) ! calc_sf <=> reset + call w_mult(b, a, para, conf) ! B = W~ A + + if (calc_sf /= 0) sf = dotprod(b, b, SIZE_SC_FIELD) + + call hmc_forces_old(p) + call dsf(p, conf, step, para, a, b) + call hmc_forces_new(p, step, i_sf1) + + !! call flip_bc(conf%u) <- done in dsf() + + call iteration_count_f1(iterations) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/dsf2.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/dsf2.F90 new file mode 100644 index 0000000000000000000000000000000000000000..9cf66fe4195ef2b0b213821c96f32bdfa7fbe211 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/dsf2.F90 @@ -0,0 +1,67 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! dsf2.F90 -- p(j,x,mu) := p(j,x,mu) - step * D_{x,mu,j} S_{f2} +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine dsf2(p, conf, step, calc_sf, sf, para) + + use typedef_hmc + use module_hmc_forces + use module_mre + use module_p_interface + use module_switches + use module_vol + implicit none + + GENERATOR_FIELD, intent(inout) :: p + type(hmc_conf), intent(in) :: conf + type(hmc_para), intent(in) :: para + integer, intent(in) :: calc_sf + REAL, intent(in) :: step + REAL, intent(out) :: sf + + type(type_mre), save :: solutions + P_SPINCOL_FIELD, save :: a, b + REAL, external :: dotprod + integer :: iterations + external :: mtil + external :: mtdagmt + + sf = ZERO + + if (.not. switches%hasenbusch) return + + ALLOCATE_SC_FIELD(a) + ALLOCATE_SC_FIELD(b) + + call flip_bc(conf%u) + + call w_mult_dag(b, conf%phi2, para, conf) ! B = W+ phi2 + call mre_get(solutions, mtil, a, b, para, conf) + call cg(mtdagmt, a, b, para, conf, iterations) ! A = inv(M~+ M~) W+ phi2 + call mre_put(solutions, a, calc_sf) ! calc_sf <=> reset + call mtil(b, a, para, conf) ! B = M~ A + if (calc_sf /= 0) sf = dotprod(b, b, SIZE_SC_FIELD) + call sc_axpy(b, conf%phi2, -ONE) ! B = B - phi2 + + call hmc_forces_old(p) + call dsf(p, conf, step, para, a, b) + call hmc_forces_new(p, step, i_sf2) + + !! call flip_bc(conf%u) <- done in dsf() + + call iteration_count_f2(iterations) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/dsg.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/dsg.F90 new file mode 100644 index 0000000000000000000000000000000000000000..d15f70a21492382124813aa1e00ec6bea32549a5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/dsg.F90 @@ -0,0 +1,54 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! dsg.F90 - p(j,x,mu) := p(j,x,mu) - step * D_{x,mu,j} S_g +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine dsg(p, u, step, beta) + + use module_hmc_forces + use module_vol + implicit none + + GENERATOR_FIELD, intent(inout) :: p + GAUGE_FIELD, intent(in) :: u + REAL, intent(in) :: step, beta + REAL :: s + SU3 :: uuu, w + integer :: mu, eo, i + + if (beta == ZERO) return + + TIMING_START(timing_bin_dsg) + + call hmc_forces_old(p) + + s = -step * beta / THREE + + do mu = 1, DIM + do eo = EVEN, ODD + !$omp parallel do private(uuu, w) + do i = 1, volh + call staple(uuu, u, i, eo, mu) + call uu(w, u(1, 1, i, eo, mu), uuu) + call im_tr_j(p(1, i, eo, mu), w, s) + enddo + enddo + enddo + + call hmc_forces_new(p, step, i_sg) + + TIMING_STOP(timing_bin_dsg) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/files.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/files.F90 new file mode 100644 index 0000000000000000000000000000000000000000..55da217cf65bd496a9389e6916728ca5e3a5c4ab --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/files.F90 @@ -0,0 +1,325 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! files.F90 +! +!------------------------------------------------------------------------------- +! +! restart_file: progname.run.{res|count|ran|stop} +! restart_conf_file: progname.run.s.time.{u|phi} +! info_file: progname.run.s.info +! +! conf_info_file: progname.run.s1.s2.traj.info +! conf_file: progname.run.s1.s2.traj.time.u +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_files ! formats of file name strings + + implicit none + + ! lengths of formats of file name strings + + integer, parameter :: l_name = 2 + integer, parameter :: l_ext = 2 + integer, parameter :: l_sep = 5 + integer, parameter :: l_num = 4 + + integer, parameter :: l_base = l_name + l_sep + l_num + integer, parameter :: l_conf = l_base + 3 * (l_sep + l_num) + + ! formats of file name strings + + character(len = l_name), save :: fmt_name = "(a" + character(len = l_name), save :: fmt_ext = "a)" + character(len = l_sep), save :: fmt_sep = ",'.'," + character(len = l_num), save :: fmt_run = "i3.3" + character(len = l_num), save :: fmt_ensemble = "i1.1" + character(len = l_num), save :: fmt_traj = "i5.5" + character(len = l_num), save :: fmt_time = "i3.3" + + character(len = *), parameter :: ext_count = "count" + character(len = *), parameter :: ext_ran = "ran" + character(len = *), parameter :: ext_res = "res" + character(len = *), parameter :: ext_u = "u" + character(len = *), parameter :: ext_phi = "phi" + character(len = *), parameter :: ext_info = "info" + character(len = *), parameter :: ext_stop = "STOP" + +CONTAINS + + character(len=l_base) function fmt_base() + fmt_base = fmt_name // fmt_sep // fmt_run + end function fmt_base + + character(len=l_conf) function fmt_conf() + fmt_conf = fmt_base() // fmt_sep // fmt_ensemble & + // fmt_sep // fmt_ensemble & + // fmt_sep // fmt_traj + end function fmt_conf + +end + +!------------------------------------------------------------------------------- +FILENAME function count_file() + + use module_files + implicit none + FILENAME :: restart_file + + count_file = restart_file(ext_count) +end + +!------------------------------------------------------------------------------- +FILENAME function ran_file() + + use module_files + implicit none + FILENAME :: restart_file + + ran_file = restart_file(ext_ran) +end + +!------------------------------------------------------------------------------- +FILENAME function res_file() + + use module_files + implicit none + FILENAME :: restart_file + + res_file = restart_file(ext_res) +end + +!------------------------------------------------------------------------------- +FILENAME function stop_file() + + use module_files + implicit none + FILENAME :: restart_file + + stop_file = restart_file(ext_stop) +end + +!------------------------------------------------------------------------------- +FILENAME function u_file(i_ensemble, time) + + use module_files + implicit none + integer, intent(in) :: i_ensemble, time + FILENAME :: restart_conf_file + + u_file = restart_conf_file(i_ensemble, time, ext_u) +end + +!------------------------------------------------------------------------------- +FILENAME function phi_file(i_ensemble, time) + + use module_files + implicit none + integer, intent(in) :: i_ensemble, time + FILENAME :: restart_conf_file + + phi_file = restart_conf_file(i_ensemble, time, ext_phi) +end + + +!------------------------------------------------------------------------------- +FILENAME function restart_file(ext) + + use module_bqcd + use module_counter + use module_files + implicit none + + character(len = *), intent(in) :: ext + FILENAME_FORMAT :: fmt + + fmt = fmt_base() // fmt_sep // fmt_ext + + write(restart_file, fmt) prog_name, counter%run, ext + +end + +!------------------------------------------------------------------------------- +FILENAME function restart_conf_file(i_ensemble, time, ext) + + use module_bqcd + use module_counter + use module_files + implicit none + + integer, intent(in) :: i_ensemble, time + character(len = *), intent(in) :: ext + FILENAME_FORMAT :: fmt + + fmt = fmt_base() // fmt_sep // fmt_ensemble & + // fmt_sep // fmt_time & + // fmt_sep // fmt_ext + + write(restart_conf_file, fmt) prog_name, counter%run, i_ensemble, time, ext + +end + +!------------------------------------------------------------------------------- +FILENAME function info_file(i_ensemble) + + use module_bqcd + use module_counter + use module_files + implicit none + + integer, intent(in) :: i_ensemble + FILENAME_FORMAT :: fmt + + fmt = fmt_base() // fmt_sep // fmt_ensemble // fmt_sep // fmt_ext + + write(info_file, fmt) prog_name, counter%run, i_ensemble, ext_info + +end + +!------------------------------------------------------------------------------- +FILENAME function conf_info_file(i_ensemble1, i_ensemble2) + + use module_bqcd + use module_counter + use module_files + implicit none + + integer, intent(in) :: i_ensemble1, i_ensemble2 + FILENAME_FORMAT :: fmt + + fmt = fmt_conf() // fmt_sep // fmt_ext + + write(conf_info_file, fmt) & + prog_name, counter%run, i_ensemble1, i_ensemble2, counter%traj, ext_info + +end + +!------------------------------------------------------------------------------- +FILENAME function conf_file(i_ensemble1, i_ensemble2, time) + + use module_bqcd + use module_counter + use module_files + implicit none + + integer, intent(in) :: i_ensemble1, i_ensemble2, time + FILENAME_FORMAT :: fmt + + fmt = fmt_conf() // fmt_sep // fmt_time // fmt_sep // fmt_ext + + write(conf_file, fmt) & + prog_name, counter%run, i_ensemble1, i_ensemble2, counter%traj, time, ext_u + +end + +!------------------------------------------------------------------------------- +subroutine check_fmt(run, max_temper, max_traj, max_time) + + use module_files + implicit none + integer :: run, max_temper, max_traj, max_time + + call check_len(run, fmt_run, "RUN") + call check_len(max_temper, fmt_ensemble, "TEMPER") + call check_len(max_traj, fmt_traj, "TRAJ") + call check_len(max_time, fmt_time, "TIME") + +CONTAINS + + subroutine check_len(counter, counter_fmt, counter_name) + + implicit none + integer :: i, len, counter + character(len = *) :: counter_fmt, counter_name + + i = index(counter_fmt, "i") + + if (i == 0) then + call die("check_fmt(): unable to check fmt for " // counter_name) + endif + + read(counter_fmt(i+1:i+1), *) len + + if (counter < 0 .or. counter >= 10**len) then + call die("check_fmt(): file name format unsuitable for " // counter_name) + endif + + end subroutine check_len + +end + +!------------------------------------------------------------------------------- +subroutine set_fmt_ensemble(N_temper) + + use module_files + implicit none + + integer, intent(in) :: N_temper + + if (N_temper < 10) then + fmt_ensemble = "i1.1" + else if (N_temper < 100) then + fmt_ensemble = "i2.2" + else + call die ("set_fmt_ensemble(): N_temper >= 100 ???") + endif + +end + + +!------------------------------------------------------------------------------- +function format_ensemble() + + use module_files + implicit none + character(len = l_num) :: format_ensemble + + format_ensemble = fmt_ensemble +end + +!------------------------------------------------------------------------------- +subroutine filename_test() + + use module_function_decl + implicit none + integer i + + FILENAME, external :: count_file + FILENAME, external :: ran_file + FILENAME, external :: res_file + FILENAME, external :: stop_file + FILENAME, external :: u_file + FILENAME, external :: phi_file + FILENAME, external :: restart_conf_file + FILENAME, external :: info_file + FILENAME, external :: conf_info_file + FILENAME, external :: conf_file + + do i = 1, 11, 10 + call set_fmt_ensemble(i) + + PUTVAL(6, count_file()) + PUTVAL(6, ran_file()) + PUTVAL(6, res_file()) + PUTVAL(6, stop_file()) + PUTVAL(6, u_file(3, 4)) + PUTVAL(6, phi_file(5, 6)) + PUTVAL(6, restart_conf_file(1, 2, 'conf')) + PUTVAL(6, info_file(7)) + PUTVAL(6, conf_info_file(7, 8)) + PUTVAL(6, conf_file(3, 4, 5)) + enddo + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/fixLouhiPP.sh b/qcd/part_cpu/applications/QCD/src/kernel_A/fixLouhiPP.sh new file mode 100644 index 0000000000000000000000000000000000000000..b6e1537ba25946b650663060d03a61e95895e258 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/fixLouhiPP.sh @@ -0,0 +1,8 @@ +sed 's/% ##/%/g' modules/module_input.F90 > tmp.dat; cp tmp.dat modules/module_input.F90; rm -f tmp.dat + +sed 's/## )/ )/g' clover/clover_init.F90 > tmp.dat; cp tmp.dat clover/clover_init.F90; rm -f tmp.dat + +sed 's/## (/ (/g' d/D21xyzt.F90 > tmp.dat; cp tmp.dat d/D21xyzt.F90; rm -f tmp.dat +sed 's/## (/ (/g' d/D2xyzt.F90 > tmp.dat; cp tmp.dat d/D2xyzt.F90; rm -f tmp.dat +sed 's/## (/ (/g' d/D31xyzt.F90 > tmp.dat; cp tmp.dat d/D31xyzt.F90; rm -f tmp.dat +sed 's/## (/ (/g' d/D3xyzt.F90 > tmp.dat; cp tmp.dat d/D3xyzt.F90; rm -f tmp.dat diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/flip_bc.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/flip_bc.F90 new file mode 100644 index 0000000000000000000000000000000000000000..f8cb9fbdb940fccfaa155b2712969f83fa5b5b51 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/flip_bc.F90 @@ -0,0 +1,112 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! flip_bc.F90 - flip fermionic boundary conditions +! (ie multiplication of corresponding links with -1) +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_flip_bc + + INTEGER, dimension(:, :, :), pointer, save :: flip_bc_list + integer, dimension(DIM), save :: flip_bc_len + +end + +!------------------------------------------------------------------------------- +subroutine init_flip_bc() + + use module_flip_bc + use module_function_decl + use module_lattice + use module_vol + implicit none + + integer, dimension(DIM) :: i0, i1, i_pe, j + integer :: me, mu, x, y, z, t, i, eo, count(EVEN:ODD) + integer, external :: xyzt2i, e_o + + allocate(flip_bc_list(volh_tot, EVEN:ODD, DIM)) + + me = my_pe() + call unlex(me, DIM, i_pe, NPE) + + do mu = 1, DIM + count = 0 + if (bc_fermions(mu) < 0) then + if (i_pe(mu) == (NPE(mu) - 1) .or. i_pe(mu) == 0) then + i0 = 0 + i1 = N - 1 + + if (i_pe(mu) == (NPE(mu) - 1)) then + i0(mu) = N(mu) - 1 + else + i0(mu) = -1 + endif + + i1(mu) = i0(mu) + + do t = i0(4), i1(4) + do z = i0(3), i1(3) + do y = i0(2), i1(2) + do x = i0(1), i1(1) + j = (/x, y, z, t/) + i = xyzt2i(j) + eo = e_o(j) + + count(eo) = count(eo) + 1 + flip_bc_list(count(eo), eo, mu) = i + enddo + enddo + enddo + enddo + + endif + endif + + if (count(EVEN) /= count(ODD)) then + call die ("init_flip_bc(): count(EVEN) /= count(ODD)") + else + flip_bc_len(mu) = count(EVEN) + endif + enddo + +end + +!------------------------------------------------------------------------------- +subroutine flip_bc(u) + + use module_flip_bc + use module_lattice + use module_vol + implicit none + + GAUGE_FIELD, intent(inout) :: u + integer :: mu, nu, count, i, eo, c1, c2 + + do mu = 1, DIM + nu = gamma_index(mu) + do eo = EVEN,ODD + do count = 1, flip_bc_len(mu) + i = flip_bc_list(count, eo, mu) + do c2 = 1, NCOL + do c1 = 1, NCOL + u(c1, c2, i, eo, nu) = -u(c1, c2, i, eo, nu) + enddo + enddo + enddo + enddo + enddo + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/h_mult.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/h_mult.F90 new file mode 100644 index 0000000000000000000000000000000000000000..67af646983001c34b430fb7402e5b962216ffedf --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/h_mult.F90 @@ -0,0 +1,107 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2000-2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! h_mult.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine h_mult_a(out, h, in, volh) ! out := out + i h gamma_5 in + + implicit none + COMPLEX, dimension (NDIRAC, *) :: out, in + REAL :: h + integer :: volh + + integer :: i + + ! statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + TIMING_START(timing_bin_h_mult_a) + + !$omp parallel do + do i = 1, NCOL * volh + out(1, i) = out(1, i) + h * i_times(in(3, i)) + out(2, i) = out(2, i) + h * i_times(in(4, i)) + out(3, i) = out(3, i) + h * i_times(in(1, i)) + out(4, i) = out(4, i) + h * i_times(in(2, i)) + enddo + + TIMING_STOP(timing_bin_h_mult_a) +end + +!------------------------------------------------------------------------------- +subroutine h_mult_b(h, x, volh) ! x := (1 + i h gamma_5) x + + implicit none + REAL :: h + COMPLEX, dimension (NDIRAC, *) :: x + integer :: volh + + integer :: i + COMPLEX :: x1, x2, x3, x4 + + ! statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + + TIMING_START(timing_bin_h_mult_b) + + !$omp parallel do private(x1, x2, x3, x4) + do i = 1, NCOL * volh + x1 = x(1, i) + x2 = x(2, i) + x3 = x(3, i) + x4 = x(4, i) + + x(1, i) = x(1, i) + h * i_times(x3) + x(2, i) = x(2, i) + h * i_times(x4) + x(3, i) = x(3, i) + h * i_times(x1) + x(4, i) = x(4, i) + h * i_times(x2) + enddo + + TIMING_STOP(timing_bin_h_mult_b) +end + +!------------------------------------------------------------------------------- +subroutine h_mult_c(out, h, in, volh) ! out = (1 + i h gamma_5) in + + implicit none + COMPLEX, dimension (NDIRAC, *) :: out, in + REAL :: h + integer :: volh + + integer :: i + + ! statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + TIMING_START(timing_bin_h_mult_c) + + !$omp parallel do + do i = 1, NCOL * volh + out(1, i) = in(1, i) + h * i_times(in(3, i)) + out(2, i) = in(2, i) + h * i_times(in(4, i)) + out(3, i) = in(3, i) + h * i_times(in(1, i)) + out(4, i) = in(4, i) + h * i_times(in(2, i)) + enddo + + TIMING_STOP(timing_bin_h_mult_c) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc.F90 new file mode 100644 index 0000000000000000000000000000000000000000..ddfed12989cc9eee43d773e8fce66013cbb78bae --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc.F90 @@ -0,0 +1,207 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! hmc.F90 - one Hybrid Monte Carlo step including Hasenbusch's and Bakeyev's +! accelerations +! +!// IR scale: tau +!// IR steps: ntau +!// UV scale: tau / m_scale +!// UV steps: m_scale +!// +!// models / splits of actions: +!// +!// model A: +!// S_UV = S_g +!// S_IR = S_det + S_f1 +!// +!// model B: +!// S_UV = S_g +!// S_IR = S_det + S_f1 + S_f2 +!// +!// model C: +!// S_UV = S_g + S_det + S_f1 +!// S_IR = S_f2 +!// +!// S_f1 = phi1+ inv(W+ W) phi1 +!// S_f2 = phi2+ W inv(M~+ M~) W+ phi2 +!// +!// W = M~ + rho +!// +!// => ir_steps = 1 and rho = 0 corresponds exactly to the previous verions +!// (rho = 0 is treated as S_f2 = 0), +!// and especially tau and ntau have the same meaning as before +!// +!// (In the whole program phi and phi2 are treated asymmetrically. +!// The reason for this is upward compatibility with the mode +!// "standard Wilson fermions + parallel tempering". +!// phi is needed for the tempering decisions.) +!// +!// Flags / switches: +!// +!// force_accept: Force acceptance after, eg, a hot or cold start. +!// test: For testing reversibility by forward/backward integration. +!// If (test /= HMC_TEST_NONE) force_accept has to be .true. +!// If (test == HMC_TEST_BACKWARDS) para%tau has to be reversed. +!// +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine hmc(para, conf, out, force_accept, test) + + use typedef_hmc + use module_function_decl + use module_p_interface + use module_switches + use module_vol + implicit none + + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(inout) :: conf + type(hmc_out), intent(out) :: out + integer, intent(in) :: force_accept + integer, intent(in) :: test + + P_GAUGE_FIELD, save :: u_bck + P_SPINCOL_FIELD, save :: phi_bck + P_CLOVER_FIELD_A, save :: a_bck, i_bck + P_CLOVER_FIELD_B, save :: b_bck + P_GENERATOR_FIELD, save :: p + + REAL :: sd_old, sd_new + REAL :: sf1_old, sf1_new + REAL :: sf2_old, sf2_new + REAL :: sg_old, sg_new + REAL :: sp_old, sp_new + REAL :: hg_old, hg_new + REAL :: h_old, h_new + REAL :: sf1, sf2 + REAL, external :: sp, sg, clover_action + + + TIMING_START(timing_bin_hmc) + + TIMING_START(timing_bin_hmc_init) + + if (.not. associated(u_bck)) then + ALLOCATE_G_FIELD(u_bck) + ALLOCATE_GEN_FIELD(p) + ALLOCATE_SC_FIELD(phi_bck) + if (switches%clover) call allocate_clover_field_a(a_bck) + if (switches%clover) call allocate_clover_field_a(i_bck) + if (switches%clover) call allocate_clover_field_b(b_bck) + endif + + sd_old = ZERO; sd_new = ZERO + sf1_old = ZERO; sf1_new = ZERO + sf2_old = ZERO; sf2_new = ZERO + sg_old = ZERO; sg_new = ZERO + sp_old = ZERO; sp_new = ZERO + sf1 = ZERO; sf2 = ZERO + + call init_cg_stat() + TIMING_STOP(timing_bin_hmc_init) + + if (test /= HMC_TEST_BACKWARDS) then ! ie normally do: + + ! backups: + + TIMING_START(timing_bin_hmc_backup) + + call swap_p_sc_field(phi_bck, conf%phi) + + u_bck = conf%u + if (switches%clover) a_bck = conf%a + if (switches%clover) i_bck = conf%i + if (switches%clover) b_bck = conf%b + + TIMING_STOP(timing_bin_hmc_backup) + + ! initialize momenta p, phi, phi2 and old action: + + call hmc_init_p(p) + call hmc_init_phi(conf, para, sf1_old, sf2_old) + + TIMING_START(timing_bin_hmc_h_old) + if (switches%clover) sd_old = clover_action(conf%b(1,1,ODD)) + sg_old = sg(conf%u) + sp_old = sp(p) + hg_old = sg_old * para%beta + + h_old = sd_old + sp_old + hg_old + sf1_old + sf2_old + TIMING_STOP(timing_bin_hmc_h_old) + endif + + if (test == HMC_TEST_FORWARDS) then + call hmc_test_report(test, p, conf%u, & + sp_old, hg_old, sf1_old, sf2_old, sd_old) + endif + +! leap frog integration: + + call hmc_leap_frog(p, para, conf, sf1, sf2) + +! calculate Hamiltonian: + + TIMING_START(timing_bin_hmc_h_new) + if (switches%clover) sd_new = clover_action(conf%b(1,1,ODD)) + sf1_new = sf1 + sf2_new = sf2 + sg_new = sg(conf%u) + sp_new = sp(p) + hg_new = sg_new * para%beta + + h_new = sd_new + sp_new + hg_new + sf1_new + sf2_new + TIMING_STOP(timing_bin_hmc_h_new) + +! accept new U ? : + + TIMING_START(timing_bin_hmc_rest) + out%exp_dh = exp(h_old - h_new) + + if (force_accept /= 0) then + out%accepted = 1 + else + if (ranf() < out%exp_dh) then + out%accepted = 1 + else + out%accepted = 0 + endif + endif + + if (out%accepted == 1) then + out%sg = sg_new + out%sf = sf1_new + else + call swap_p_sc_field(conf%phi, phi_bck) + call swap_p_g_field(conf%u, u_bck) + call swap_p_clover_field_a(conf%a, a_bck) + call swap_p_clover_field_a(conf%i, i_bck) + call swap_p_clover_field_b(conf%b, b_bck) + endif + + call get_cg_stat(out%cg_ncall, out%cg_niter_max, out%cg_niter_tot) + + call iteration_count_write(UREC) + call hmc_forces_write(UREC) + + if (test == HMC_TEST_BACKWARDS) then + call hmc_test_report(test, p, conf%u, & + sp_new, hg_new,sf1_new, sf2_new, sd_new) + endif + + TIMING_STOP(timing_bin_hmc_rest) + + TIMING_STOP(timing_bin_hmc) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_check.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_check.F90 new file mode 100644 index 0000000000000000000000000000000000000000..b0793399a0058613b5e32f9a498a75306db5bafe --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_check.F90 @@ -0,0 +1,122 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! hmc_check.F90 - check by forward/backward leap frog integration +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine hmc_check(check, p, u, hp, hg, hf1, hf2, hd) + + use module_function_decl + use module_vol + implicit none + + integer, intent(in) :: check + GENERATOR_FIELD, intent(in) :: p + GAUGE_FIELD, intent(in) :: u + REAL, intent(in) :: hp, hg, hf1, hf2, hd + + P_GENERATOR_FIELD, save :: p_start + P_GAUGE_FIELD, save :: u_start + REAL, save :: hp_start + REAL, save :: hg_start + REAL, save :: hf1_start + REAL, save :: hf2_start + REAL, save :: hd_start + + REAL :: diff_p, diff_u + integer :: i, eo, mu, j, c1, c2 + + if (.not. associated(p_start)) then + allocate(p_start(NGEN, volh_tot, EVEN:ODD, DIM)) + allocate(u_start(NCOL, NCOL, volh_tot, EVEN:ODD, DIM)) + endif + + if (check == HMC_CHECK_FORWARDS) then + + p_start = p + u_start = u + hp_start = hp + hg_start = hg + hf1_start = hf1 + hf2_start = hf2 + hd_start = hd + + else if (check == HMC_CHECK_BACKWARDS) then + + diff_p = ZERO + diff_u = ZERO + + do mu = 1, DIM + do eo = EVEN, ODD + do i = 1, volh + do j = 1, NGEN + diff_p = max(diff_p, abs(p_start(j,i,eo,mu) - p(j,i,eo,mu))) + enddo + do c2 = 1, NCOL + do c1 = 1, NCOL + diff_u = max(diff_u, & + abs(relative_change(Re(u_start(c1,c2,i,eo,mu)), & + Re(u(c1,c2,i,eo,mu))))) + diff_u = max(diff_u, & + abs(relative_change(Im(u_start(c1,c2,i,eo,mu)), & + Im(u(c1,c2,i,eo,mu))))) + enddo + enddo + enddo + enddo + enddo + + if (my_pe() == 0) then + call begin(UREC, "HMC-check") + write(UREC, *) + write(UREC,400) "Configuration changes (maximal abs. relative changes):" + write(UREC, *) + write(UREC,410) "Generator field:", diff_p + write(UREC,410) "Gauge field: ", diff_u + write(UREC, *) + write(UREC, *) + write(UREC,400) "Energy changes:" + write(UREC, *) + write(UREC,420) "Energy ", "old value", "rel.change" + write(UREC, *) + write(UREC,430) "H_generator", hp_start, relative_change(hp_start, hp) + write(UREC,430) "H_gauge ", hg_start, relative_change(hg_start, hg) + write(UREC,430) "H_fermion_1", hf1_start, relative_change(hf1_start, hf1) + write(UREC,430) "H_fermion_2", hf2_start, relative_change(hf2_start, hf2) + write(UREC,430) "H_det ", hd_start, relative_change(hd_start, hd) + write(UREC, *) + call end_A(UREC, "HMC-check") + endif + +400 format (1x, a) +410 format (1x, a, e8.1) +420 format (1x, a, a20, a12) +430 format (1x, a, e20.10, e12.1) + + else + call die("hmc_check(): unknown check flag.") + endif + +contains + + REAL function relative_change(old, new) + + implicit none + REAL, intent(in) :: old, new + + relative_change = old / (new - old) + + end function relative_change + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_forces.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_forces.F90 new file mode 100644 index 0000000000000000000000000000000000000000..4f7e3985ee39a367b3299c23e03474b41bbfc710 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_forces.F90 @@ -0,0 +1,155 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! hmc_forces.F90 - calculation of HMC forces in Hasenbusch improvement, +! does not work with tempering +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!----------------------------------------------------------------------------- +subroutine hmc_forces_old(p) + + use module_hmc_forces + use module_p_interface + use module_switches + use module_vol + implicit none + + GENERATOR_FIELD, intent(in) :: p + integer :: mu, eo, i + + if (.not. switches%hasenbusch) return + + if (.not. associated(p_old)) then + call allocate_gen_field(p_old) + f_count = ZERO + f_avg = ZERO + f_max = ZERO + endif + + + do mu = 1, DIM + do eo = EVEN, ODD + !$omp parallel do + do i = 1, volh + p_old(1, i, eo, mu) = p(1, i, eo, mu) + p_old(2, i, eo, mu) = p(2, i, eo, mu) + p_old(3, i, eo, mu) = p(3, i, eo, mu) + p_old(4, i, eo, mu) = p(4, i, eo, mu) + p_old(5, i, eo, mu) = p(5, i, eo, mu) + p_old(6, i, eo, mu) = p(6, i, eo, mu) + p_old(7, i, eo, mu) = p(7, i, eo, mu) + p_old(8, i, eo, mu) = p(8, i, eo, mu) + enddo + enddo + enddo + +end + +!----------------------------------------------------------------------------- +subroutine hmc_forces_new(p, step, which) + + use module_hmc_forces + use module_function_decl + use module_switches + use module_vol + implicit none + + GENERATOR_FIELD, intent(in) :: p + REAL, intent(in) :: step + integer, intent(in) :: which + + integer :: mu, eo, i + REAL :: force + + if (.not. switches%hasenbusch) return + + force = ZERO + do mu = 1, DIM + do eo = EVEN, ODD + !$omp parallel do + do i = 1, volh + force = force & + + (p_old(1, i, eo, mu) - p(1, i, eo, mu))**2 & + + (p_old(2, i, eo, mu) - p(2, i, eo, mu))**2 & + + (p_old(3, i, eo, mu) - p(3, i, eo, mu))**2 & + + (p_old(4, i, eo, mu) - p(4, i, eo, mu))**2 & + + (p_old(5, i, eo, mu) - p(5, i, eo, mu))**2 & + + (p_old(6, i, eo, mu) - p(6, i, eo, mu))**2 & + + (p_old(7, i, eo, mu) - p(7, i, eo, mu))**2 & + + (p_old(8, i, eo, mu) - p(8, i, eo, mu))**2 + enddo + enddo + enddo + + force = global_sum(force) / (NGEN * volume * DIM) + force = sqrt(force) / abs(step) + + f_count(which) = f_count(which) + ONE + f_avg(which) = f_avg(which) + force + f_max(which) = max(f_max(which), force) + +end + + +!----------------------------------------------------------------------------- +subroutine hmc_forces_write(unit) + + use module_hmc_forces + use module_counter + use module_function_decl + use module_switches + + implicit none + + integer, intent(in) :: unit + integer, save :: written = 0 + integer :: i + + character(*), parameter :: key_avg = "%Favg" + character(*), parameter :: key_max = "%Fmax" + character(*), parameter :: fmt_h = "(1x, 2a, a6, 4a)" + character(*), parameter :: fmt_b = "(1x, a6, i6, 4g20.10)" + + character(20), dimension(n_force) :: f_name + + if (.not. switches%hasenbusch) return + + f_name(i_sg) = " F_gauge" + f_name(i_sd) = " F_det" + f_name(i_sf1) = " F_F1" + f_name(i_sf2) = " F_F2" + + do i = 1, n_force + if (f_count(i) /= ZERO) then + f_avg(i) = f_avg(i) / f_count(i) + endif + enddo + + if (written == 0 .and. my_pe() == 0) then + write(unit, fmt_h) "T", key_avg, "traj", (f_name(i), i = 1, n_force) + write(unit, fmt_h) "T", key_max, "traj", (f_name(i), i = 1, n_force) + endif + + if (my_pe() == 0) then + write(unit, fmt_b) key_avg, counter%traj, (f_avg(i), i = 1, n_force) + write(unit, fmt_b) key_max, counter%traj, (f_max(i), i = 1, n_force) + endif + + + written = written + 1 + f_count = ZERO + f_avg = ZERO + f_max = ZERO + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_init_p.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_init_p.F90 new file mode 100644 index 0000000000000000000000000000000000000000..4d40dbf9d606cd8849f4b112daa0467526ae3252 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_init_p.F90 @@ -0,0 +1,36 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! hmc_init_p.F90 - initialization of momenta +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine hmc_init_p(p) + + use module_vol + implicit none + + GENERATOR_FIELD, intent(out) :: p + integer :: mu, eo + + TIMING_START(timing_bin_hmc_init_p) + + do mu = 1, DIM + do eo = EVEN, ODD + call ran_gauss_volh(NGEN/2, p(1,1,eo,mu), ONE, eo) + enddo + enddo + + TIMING_STOP(timing_bin_hmc_init_p) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_init_phi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_init_phi.F90 new file mode 100644 index 0000000000000000000000000000000000000000..3e42f7ff0d01f2b7ee365e2b987dcfd256e45996 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_init_phi.F90 @@ -0,0 +1,66 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! hmc_init_phi.F90 - initialises phi, phi2 and calculates actions +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine hmc_init_phi(conf, para, sf1, sf2) + + use typedef_hmc + use module_function_decl + use module_p_interface + use module_switches + use module_vol + implicit none + + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(inout) :: conf + REAL, intent(out) :: sf1 + REAL, intent(out) :: sf2 + + P_SPINCOL_FIELD, save :: tmp + + integer :: iterations + external :: w_dagger_w + + TIMING_START(timing_bin_hmc_init_phi) + + sf1 = ZERO + sf2 = ZERO + + if (.not. switches%dynamical) return + + ALLOCATE_SC_FIELD(tmp) + + call flip_bc(conf%u) + + + call ran_gauss_volh(NDIRAC * NCOL, tmp, HALF, EVEN) ! tmp = noise + sf1 = dotprod(tmp, tmp, SIZE_SC_FIELD) + call w_mult_dag(conf%phi, tmp, para, conf) ! phi = W+ noise + + if (switches%hasenbusch) then + call ran_gauss_volh(NDIRAC * NCOL, tmp, HALF, EVEN) ! tmp = noise + sf2 = dotprod(tmp, tmp, SIZE_SC_FIELD) + call mtil_dag(conf%phi2, tmp, para, conf) ! phi2 = M~+ noise + call cg(w_dagger_w, tmp, conf%phi2, para, conf, iterations) + ! tmp = inv(W+ W) M~+ noise + call w_mult(conf%phi2, tmp, para, conf) ! phi2 = inv(W+) M~+ noise + endif + + call flip_bc(conf%u) + + TIMING_STOP(timing_bin_hmc_init_phi) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_integrator.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_integrator.F90 new file mode 100644 index 0000000000000000000000000000000000000000..609a126116e2d27014680aeecc8e8f97eec5fd7b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_integrator.F90 @@ -0,0 +1,94 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! hmc_integrator.F90 - integrators for the different models +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine hmc_integrator_p_ir(p, para, conf, step, calc_sf, sf1, sf2) + + use typedef_hmc + use module_switches + use module_vol + implicit none + + GENERATOR_FIELD, intent(inout) :: p + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(in) :: conf + REAL, intent(in) :: step + integer, intent(in) :: calc_sf + REAL, intent(out) :: sf1, sf2 + + select case (para%model) + case ("A") + call dsf1(p, conf, step, calc_sf, sf1, para) + call dsd(p, conf, step, para) + case ("B") + call dsf1(p, conf, step, calc_sf, sf1, para) + call dsf2(p, conf, step, calc_sf, sf2, para) + call dsd(p, conf, step, para) + case ("C") + call dsf2(p, conf, step, calc_sf, sf2, para) + case default + call die("hmc_integrator_p_ir: " // para%model // ": unknown model") + end select + +end + +!------------------------------------------------------------------------------- +subroutine hmc_integrator_p_uv(p, para, conf, step, calc_sf, sf1, sf2) + + use typedef_hmc + use module_switches + use module_vol + implicit none + + GENERATOR_FIELD, intent(inout) :: p + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(in) :: conf + REAL, intent(in) :: step + integer, intent(in) :: calc_sf + REAL, intent(out) :: sf1, sf2 + + select case (para%model) + case ("A") + call dsg(p, conf%u, step, para%beta) + case ("B") + call dsg(p, conf%u, step, para%beta) + case ("C") + call dsg(p, conf%u, step, para%beta) + call dsf1(p, conf, step, calc_sf, sf1, para) + call dsd(p, conf, step, para) + case default + call die("hmc_integrator_p_uv: " // para%model // ": unknown model") + end select + +end + +!------------------------------------------------------------------------------- +subroutine hmc_integrator_q(p, para, conf, step) + + use typedef_hmc + use module_switches + use module_vol + implicit none + + GENERATOR_FIELD, intent(in) :: p + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(inout) :: conf + REAL, intent(in) :: step + + call hmc_u(p, conf, step, para) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_leap_frog.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_leap_frog.F90 new file mode 100644 index 0000000000000000000000000000000000000000..246e5b31d409006998ccb2f5b3e252de02c97a5e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_leap_frog.F90 @@ -0,0 +1,70 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! hmc_leap_frog.F90 - two time scale leap frog integration +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine hmc_leap_frog(p, para, conf, sf1, sf2) + + use typedef_hmc + use module_switches + use module_vol + implicit none + + GENERATOR_FIELD, intent(inout) :: p + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(inout) :: conf + REAL, intent(out) :: sf1, sf2 + + REAL :: step_ir, step_uv + integer :: calc_sf, itau, i_scale + + calc_sf = 0 + +! first half ir and uv step + + step_ir = HALF * para%tau + step_uv = HALF * para%tau / para%m_scale + + TIMING_START(timing_bin_hmc_half_step0) + call hmc_integrator_p_ir(p, para, conf, step_ir, calc_sf, sf1, sf2) + call hmc_integrator_p_uv(p, para, conf, step_uv, calc_sf, sf1, sf2) + TIMING_STOP(timing_bin_hmc_half_step0) + + step_ir = para%tau + step_uv = para%tau / para%m_scale + + do itau = 1, para%ntau + do i_scale = 1, para%m_scale + + call hmc_integrator_q(p, para, conf, step_uv) + + if (itau == para%ntau .and. i_scale == para%m_scale) then + step_ir = step_ir * HALF ! final half steps + step_uv = step_uv * HALF + calc_sf = 1 ! calculate new S_f + endif + + TIMING_START(timing_bin_hmc_steps) + call hmc_integrator_p_uv(p, para, conf, step_uv, calc_sf, sf1, sf2) + TIMING_STOP(timing_bin_hmc_steps) + enddo + + TIMING_START(timing_bin_hmc_steps) + call hmc_integrator_p_ir(p, para, conf, step_ir, calc_sf, sf1, sf2) + TIMING_STOP(timing_bin_hmc_steps) + enddo + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_test.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_test.F90 new file mode 100644 index 0000000000000000000000000000000000000000..c7233bf8d5cf916c16b903035804d0d50d6ce740 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_test.F90 @@ -0,0 +1,177 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! hmc_test.F90 - forward/backward leap frog integration +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine hmc_test(para, conf) + + use typedef_hmc + use module_function_decl + use module_vol + implicit none + + type(hmc_para), intent(inout) :: para + type(hmc_conf), intent(inout) :: conf + type(hmc_out) :: out + + call begin(UREC, "HMCtest") + + call hmc(para, conf, out, .true., HMC_TEST_FORWARDS) + + call write_out("forward ") + + para%tau = -para%tau + + call hmc(para, conf, out, .true., HMC_TEST_BACKWARDS) + + para%tau = -para%tau + + call write_out("backward ") + + call end_A(UREC, "HMCtest") + + +contains + + subroutine write_out(direction) + + character(*) :: direction + REAL :: plaq + + if (my_pe() == 0) then + plaq = out%sg / (6 * volume) + write(UREC, *) + write(UREC, 400) "Direction", "PlaqEnergy", & + "exp(-Delta_H)", "CGcalls", "CGitTot", "CGitMax" + write(UREC, 410) direction, plaq, out%exp_dh, & + out%cg_ncall, out%cg_niter_tot, out%cg_niter_max + write(UREC, *) + endif + +400 format (1x, a, 2a15, 3a8) +410 format (1x, a, 2f15.10, 3i8) + + end subroutine write_out + +end + +!------------------------------------------------------------------------------- +subroutine hmc_test_report(test, p, u, hp, hg, hf1, hf2, hd) + + use module_function_decl + use module_vol + implicit none + + integer, intent(in) :: test + GENERATOR_FIELD, intent(in) :: p + GAUGE_FIELD, intent(in) :: u + REAL, intent(in) :: hp, hg, hf1, hf2, hd + + P_GENERATOR_FIELD, save :: p_start + P_GAUGE_FIELD, save :: u_start + REAL, save :: hp_start + REAL, save :: hg_start + REAL, save :: hf1_start + REAL, save :: hf2_start + REAL, save :: hd_start + + REAL :: diff_p, diff_u + integer :: i, eo, mu, j, c1, c2 + + if (.not. associated(p_start)) then + allocate(p_start(NGEN, volh_tot, EVEN:ODD, DIM)) + allocate(u_start(NCOL, NCOL, volh_tot, EVEN:ODD, DIM)) + endif + + if (test == HMC_TEST_FORWARDS) then + + p_start = p + u_start = u + hp_start = hp + hg_start = hg + hf1_start = hf1 + hf2_start = hf2 + hd_start = hd + + else if (test == HMC_TEST_BACKWARDS) then + + diff_p = ZERO + diff_u = ZERO + + do mu = 1, DIM + do eo = EVEN, ODD + do i = 1, volh + do j = 1, NGEN + diff_p = max(diff_p, abs(p_start(j,i,eo,mu) - p(j,i,eo,mu))) + enddo + do c2 = 1, NCOL + do c1 = 1, NCOL + diff_u = max(diff_u, & + abs(relative_change(Re(u_start(c1,c2,i,eo,mu)), & + Re(u(c1,c2,i,eo,mu))))) + diff_u = max(diff_u, & + abs(relative_change(Im(u_start(c1,c2,i,eo,mu)), & + Im(u(c1,c2,i,eo,mu))))) + enddo + enddo + enddo + enddo + enddo + + if (my_pe() == 0) then + write(UREC, *) + write(UREC,400) "Configuration changes (maximal abs. relative changes):" + write(UREC, *) + write(UREC,410) "Generator field:", diff_p + write(UREC,410) "Gauge field: ", diff_u + write(UREC, *) + write(UREC, *) + write(UREC,400) "Energy changes:" + write(UREC, *) + write(UREC,420) "Energy ", "old value", "rel.change" + write(UREC, *) + write(UREC,430) "H_generator", hp_start, relative_change(hp_start, hp) + write(UREC,430) "H_gauge ", hg_start, relative_change(hg_start, hg) + write(UREC,430) "H_fermion_1", hf1_start, relative_change(hf1_start, hf1) + write(UREC,430) "H_fermion_2", hf2_start, relative_change(hf2_start, hf2) + write(UREC,430) "H_det ", hd_start, relative_change(hd_start, hd) + write(UREC, *) + endif + +400 format (1x, a) +410 format (1x, a, e8.1) +420 format (1x, a, a20, a12) +430 format (1x, a, e20.10, e12.1) + + else + call die("hmc_test_report(): illegal test flag.") + endif + +contains + + REAL function relative_change(old, new) + + implicit none + REAL, intent(in) :: old, new + + if (old == ZERO) then + relative_change = ZERO + else + relative_change = (new - old) / old + endif + + end function relative_change + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_u.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_u.F90 new file mode 100644 index 0000000000000000000000000000000000000000..fa3ad5724f480953ad4c36b0f830178a80951b32 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_u.F90 @@ -0,0 +1,60 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! hmc_u.F90 - U := exp(i * lambda_j * P_j * step) * U +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine hmc_u(p, conf, step, para) + + use typedef_hmc + use module_switches + use module_vol + implicit none + + GENERATOR_FIELD, intent(in) :: p + type(hmc_conf), intent(inout) :: conf + REAL, intent(in) :: step + type(hmc_para), intent(in) :: para + + GENERATOR :: q + SU3 :: v + integer :: i, mu, eo, j + + TIMING_START(timing_bin_hmc_u) + + do mu = 1, DIM + do eo = EVEN, ODD + !$omp parallel do private(j, q, v) + do i = 1, VOLH + do j = 1, NGEN + q(j) = p(j, i, eo, mu) * step + enddo + call gen2u(v, q) + call u_update(conf%u(1, 1, i, eo, mu), v) ! u = v * u + call u_normalize(conf%u(1, 1, i, eo, mu)) + enddo + enddo + enddo + + TIMING_STOP(timing_bin_hmc_u) + + TIMING_START(timing_bin_hmc_xbound_g) + call xbound_g_field(conf%u) + TIMING_STOP(timing_bin_hmc_xbound_g) + + if (switches%clover) then + call clover_init(conf%a, conf%i, conf%b, conf%u, para%csw_kappa) + endif +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/index.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/index.F90 new file mode 100644 index 0000000000000000000000000000000000000000..57a35648262b1bd93ee1b786b7c6680e071f10aa --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/index.F90 @@ -0,0 +1,144 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! index.F90 - functions for index calculations +! these functions work "stand alone" +! +!------------------------------------------------------------------------------- + integer function i_e_o(dim, i) ! returns EVEN or ODD (0 or 1) + + implicit none + integer dim, i(dim), d + + i_e_o = 0 + do d = 1, dim + i_e_o = i_e_o + i(d) + enddo + + i_e_o = mod(abs(i_e_o), 2) + end + +!------------------------------------------------------------------------------- + integer function i_global(i_local, N_local, i_pe) + + implicit none + integer i_local, N_local, i_pe + + i_global = i_pe * N_local + i_local + + end + +!------------------------------------------------------------------------------- + integer function i_local(i_global, N_local, i_pe) + + implicit none + integer i_global, N_local, i_pe + + i_local = i_global - i_pe * N_local + + end + +!------------------------------------------------------------------------------- + integer function i_periodic(i, n) + + implicit none + integer i, n + + if (i .ge. 0) then + if (i .lt. n) then + i_periodic = i + elseif (i .lt. 2 * n) then + i_periodic = i - n + else + i_periodic = mod(i, n) + endif + else + if (i .ge. -n) then + i_periodic = i + n + else + i_periodic = i + (1 - (i + 1) / n) * n + endif + endif + end + +!------------------------------------------------------------------------------- + integer function ieo(dim, i, n) + + implicit none + integer dim, i(dim), n(dim), d, ilex + + ieo = ilex(dim, i, n) / 2 + end + +!------------------------------------------------------------------------------- + integer function ilex(dim, i, n) + + implicit none + integer dim, i(dim), n(dim), d + + ilex = i(dim) + do d = dim - 1, 1, -1 + ilex = ilex * n(d) + i(d) + enddo + + end + +!------------------------------------------------------------------------------- + integer function n_sites(dim, direction, n, npe) + +! returns number of sites of local grid and boundaries +! n_sites(dim, (/0, 0, ..., 0/), n, npe) is the (local) grid volume + + implicit none + integer dim, direction(dim), n(dim), npe(dim), d + + n_sites = 1 + do d = 1, dim + if (direction(d) .eq. 0) then + n_sites = n_sites * n(d) + else + if (npe(d) .eq. 1) then ! grid not partitioned in d-direction + n_sites = 0 + return + endif + endif + enddo + + end + +!------------------------------------------------------------------------------- +!! subroutine uneo(ieo, eo, dim, i, n) ! returns i for given (ieo, eo) +!! +!! implicit none +!! integer ieo, eo, dim, i(dim), n(dim), e_o +!! +!! call unlex(2 * ieo, dim, i, n) +!! i(1) = i(1) + ieor(e_o(dim-1, i(2)), eo) +!! +!! end +!! +!------------------------------------------------------------------------------- + subroutine unlex(ilex, dim, i, n) + + ! remember the range of ilex: 0 <= ilex < (n(1) * ... * n(dim)) + + implicit none + integer ilex, dim, i(dim), n(dim), j, d + + j = ilex + do d = 1, dim - 1 + i(d) = mod(j, n(d)) + j = j / n(d) ! integer division + enddo + i(dim) = j + + end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/index2.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/index2.F90 new file mode 100644 index 0000000000000000000000000000000000000000..1f1c3aa3dfabd03c13c5f921514cfe17b35cb6c2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/index2.F90 @@ -0,0 +1,55 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! index2.F90 - more functions for index calculations +! these functions use modules +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine local2global(home, local, global) + + use module_lattice + implicit none + + integer, intent(in) :: home ! home process of "local" + integer, intent(in), dimension(DIM) :: local ! local coordinates + integer, intent(out),dimension(DIM) :: global ! global coordinates + + integer, external :: i_global, i_periodic + integer :: coord_home(DIM), i + + + call unlex(home, DIM, coord_home, npe) + + do i = 1, DIM + global(i) = i_global(local(i), N(i), coord_home(i)) + global(i) = i_periodic(global(i), L(i)) + enddo + +end + +!------------------------------------------------------------------------------- +integer function e_o(local) !// returns EVEN or ODD (0 or 1) + + use module_function_decl + implicit none + + integer, intent(in), dimension(DIM) :: local ! local coordinates + integer, dimension(DIM) :: global ! global coordinates + integer, external :: i_e_o + + call local2global(my_pe(), local, global) + + e_o = i_e_o(DIM, global) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/init_common.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/init_common.F90 new file mode 100644 index 0000000000000000000000000000000000000000..61a6cf938f34632a4e0c85981dc320e616ec3f0a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/init_common.F90 @@ -0,0 +1,396 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! init_common.F90 - initialize common blocks (now: mostly modules) +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine init_common(para) + + use typedef_para + implicit none + type(type_para), intent(in) :: para + + call init_common_lattice(para%L, para%NPE, para%bc_fermions, para%gamma_index) + call init_common_vol(para%L, para%NPE) + call init_common_nnpe(para%NPE) + call init_common_offset() + call init_common_nn() + call init_common_thread() +end + +!------------------------------------------------------------------------------- +subroutine init_common_lattice(ll, pe, bc_f, gamma_i) + + use module_function_decl + use module_lattice + implicit none + + integer, dimension(DIM), intent(in) :: ll, pe, bc_f, gamma_i + + integer :: i, count(DIM) + + L = ll + NPE = pe + N = ll / pe + NH = N + + NH(1) = N(1) / 2 + + bc_fermions = bc_f + gamma_index = gamma_i + + if (mod(L(1), 2) /= 0) call die("init_common_lattice(): LX must be even") + if (mod(L(2), 2) /= 0) call die("init_common_lattice(): LY must be even") + if (mod(L(3), 2) /= 0) call die("init_common_lattice(): LZ must be even") + if (mod(L(4), 2) /= 0) call die("init_common_lattice(): LT must be even") + + if (mod(N(1), 2) /= 0) call die("init_common_lattice(): NX must be even") + + if (NPE(1) * NPE(2) * NPE(3) * NPE(4) /= num_pes()) then + call die("init_common_lattice(): N_PEs wrong") + endif + +if (N(1)*NPE(1)/=L(1)) call die("init_common_lattice(): NPEX not divider of LX") +if (N(2)*NPE(2)/=L(2)) call die("init_common_lattice(): NPEY not divider of LY") +if (N(3)*NPE(3)/=L(3)) call die("init_common_lattice(): NPEZ not divider of LZ") +if (N(4)*NPE(4)/=L(4)) call die("init_common_lattice(): NPET not divider of LT") + + count = 0 + do i = 1, DIM + if (gamma_index(i) < 1 .or. gamma_index(i) > DIM) then + call die("init_common_lattice(): gamma_index: out of range") + endif + count(gamma_index(i)) = count(gamma_index(i)) + 1 + enddo + + do i = 1, DIM + if (count(i) /= 1) then + call die("init_common_lattice(): gamma_index: inconsistent") + endif + enddo + + select case (version_of_d()) + case(3,4,22) + do i = 1, DIM + if (gamma_index(i) /= i) then + call die( & + "init_common_lattice(): gamma_index: not changeable for this version of D()") + endif + enddo + end select + + do i = 1, DIM + decomp_direction(gamma_index(i)) = i + enddo + +end + +!------------------------------------------------------------------------------- +subroutine init_common_nn() + + use module_function_decl + use module_lattice + use module_vol + use module_nn + implicit none + + integer :: x, y, z, t, i, eo, mu, fb, dir, tmp + integer, dimension (DIM) :: j, start, end + integer, external :: e_o, xyzt2i, i_periodic + integer, parameter :: out_of_range = 2000000000 + + allocate(nn(volh_tot, EVEN:ODD, DIM, FWD:BWD)) + + do fb = FWD, BWD + do mu = 1, DIM + do eo = EVEN, ODD + !$omp parallel do + do i = 1, volh_tot + nn(i, eo, mu, fb) = out_of_range + enddo + enddo + enddo + enddo + + do mu = 1, DIM + if (NPE(mu) == 1) then + start(mu) = 0 + end(mu) = N(mu) - 1 + else + start(mu) = -1 + end(mu) = N(mu) + endif + enddo + + do t = start(4), end(4) + do z = start(3), end(3) + do y = start(2), end(2) + do x = start(1), end(1) + + j = (/x,y,z,t/) + + i = xyzt2i(j) + eo = e_o(j) + + do fb = FWD, BWD + if (fb == FWD) then + dir = +1 + else + dir = -1 + endif + do mu = 1, DIM + j = (/x, y, z, t/) + j(mu) = j(mu) + dir + + if (NPE(mu) == 1) j(mu) = i_periodic(j(mu), L(mu)) + + if (j(mu) < -1 .or. j(mu) > N(mu)) then + nn(i, eo, gamma_index(mu), fb) = out_of_range + else + nn(i, eo, gamma_index(mu), fb) = xyzt2i(j) + endif + enddo + enddo + + enddo + enddo + enddo + enddo + + do fb = FWD, BWD + do mu = 1, DIM + do eo = EVEN, ODD + do i = 1, volh + tmp = nn(i, eo, mu, fb) + if (tmp < 1 .or. tmp > volh_tot) call die("init_common_nn(): error1") + if (num_pes() == 1 .and. tmp > volh) call die("init_common_nn(): error2") + enddo + enddo + enddo + enddo + + do fb = FWD, BWD + do mu = 1, DIM + do eo = EVEN, ODD + do i = 1, volh + tmp = nn(i, eo, mu, fb) + if (nn(tmp, EVEN + ODD - eo, mu, FWD + BWD - fb) /= i) & + call die("init_common_nn(): error3") + enddo + enddo + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine init_common_nnpe(NPE) + + use module_function_decl + use module_nnpe + implicit none + + integer, dimension (DIM) :: NPE, i, j, start, end + integer, external :: i_periodic, ilex + integer :: me, pe, x, y, z, t, mu + + me = my_pe() + nnpe(:, :, :, :) = me + + call unlex(me, DIM, i, NPE) + + do mu = 1, DIM + if (NPE(mu) == 1) then + start(mu) = 0 + end(mu) = 0 + else + start(mu) = -1 + end(mu) = 1 + endif + enddo + + do t = start(4), end(4) + do z = start(3), end(3) + do y = start(2), end(2) + do x = start(1), end(1) + + j(1) = i_periodic(i(1) + x, NPE(1)) + j(2) = i_periodic(i(2) + y, NPE(2)) + j(3) = i_periodic(i(3) + z, NPE(3)) + j(4) = i_periodic(i(4) + t, NPE(4)) + + pe = ilex(DIM, j, NPE) + + if (pe < 0 .or. pe >= num_pes()) then + call die ("init_common_pe(): pe out of range") + endif + + nnpe(x, y, z, t) = pe + + enddo + enddo + enddo + enddo + + ASSERT(nnpe(0,0,0,0) == my_pe()) + +end + +!------------------------------------------------------------------------------- +subroutine init_common_offset() + + use module_lattice + use module_vol + use module_offset + implicit none + + integer, external :: n_sites + integer :: x, y, z, t, off, off2, mu + integer :: start(DIM), end(DIM) + + + !!ASSERT(n_sites(DIM, (/0,0,0,0/), N, NPE) == vol) + !!ASSERT(n_sites(DIM, (/0,0,0,0/), NH, NPE) == volh) + + do mu = 1, DIM + if (NPE(mu) == 1) then + start(mu) = 0 + end(mu) = 0 + else + start(mu) = -1 + end(mu) = 1 + endif + enddo + + off = n_sites(DIM,(/0,0,0,0/), NH, NPE) ! volh + + do t = start(4), end(4) + do z = start(3), end(3) + do y = start(2), end(2) + do x = start(1), end(1) + + if (x == 0 .and. y == 0 .and. z == 0 .and. t == 0) then + offset(x,y,z,t) = 0 + else + offset(x,y,z,t) = off + off = off + n_sites(DIM, (/x,y,z,t/), NH, NPE) + endif + + enddo + enddo + enddo + enddo + + off2 = 1 + do mu = 1, DIM + if (NPE(mu) == 1) then + off2 = off2 * NH(mu) + else + off2 = off2 * (NH(mu) + 2) + endif + enddo + + ASSERT(off == off2) + ASSERT(off <= volh_tot) + +end + +!------------------------------------------------------------------------------- +subroutine init_common_vol(L, NPE) + + use module_vol + implicit none + + integer, dimension(DIM), intent(in) :: L, NPE + integer, dimension(DIM) :: N + + N = L / NPE + + volume = L(1) * L(2) * L(3) * L(4) + vol = N(1) * N(2) * N(3) * N(4) + volh = vol / 2 + volh_tot = (N(1)/2 + 2) * (N(2) + 2) * (N(3) + 2) * (N(4) + 2) + + size_sc_field = SIZE_COMPLEX * NDIRAC * NCOL * volh + +end + +!------------------------------------------------------------------------------- +subroutine init_common_thread() + + use module_function_decl + use module_thread + use module_vol + implicit none + + integer :: i, size + +#ifdef _OPENMP + integer :: omp_get_max_threads + n_thread = omp_get_max_threads() +#else + n_thread = 1 +#endif + + if (version_of_d() /= 21 .and. & + version_of_d() /= 22 .and. & + version_of_d() > 2) then + if (n_thread < 2) then + call die("init_common_thread(): " // & + "need at least 2 threads for compiled version of D()") + endif + endif + + allocate(xyz_start(0:n_thread-1)) + allocate(xyz_end(0:n_thread-1)) + allocate(t_start(0:n_thread-1)) + allocate(t_end(0:n_thread-1)) + + if (n_thread == 1) then + xyz_start(0) = 1 + xyz_end(0) = volh + t_start(0) = 1 + t_end(0) = volh + else + xyz_start(0) = 0 + xyz_end(0) = -1 + call init_common_thread_split(n_thread - 1, volh, xyz_start(1), xyz_end(1)) + call init_common_thread_split(n_thread, volh, t_start, t_end) + endif +end + +!------------------------------------------------------------------------------- +subroutine init_common_thread_split(n, size, start, end) + + implicit none + integer, intent(in) :: n, size + integer, intent(out) :: start(n), end(n) + integer :: chunk, rest, i + + chunk = size / n + rest = size - (chunk * n) + + start(1) = 1 + end(1) = chunk + if (rest >= 1) end(1) = end(1) + 1 + + do i = 2, n + start(i) = end(i - 1) + 1 + end(i) = end(i - 1) + chunk + if (rest >= i) end(i) = end(i) + 1 + enddo + + ASSERT(end(n) == size) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/init_modules.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/init_modules.F90 new file mode 100644 index 0000000000000000000000000000000000000000..ba4de3f936678abd83f1217655a1b81ed40c2f1e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/init_modules.F90 @@ -0,0 +1,119 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! init_modules.F90 - initialise (some) modules +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine init_modules() + + call init_module_decomp() + call init_module_lattice_io() + call init_module_sc_size() +end + +!------------------------------------------------------------------------------- +subroutine init_module_decomp() + + use module_decomp + use module_function_decl + use module_lattice + use module_vol + implicit none + + integer :: i, j, me, i_pe(DIM) + integer :: x, y, z, t, eo + integer :: x_std(DIM), i_std + integer :: x_act(DIM), i_act + integer :: me_act, me_std + + integer, external :: e_o + integer, external :: ieo + + + allocate(decomp%std%i(volh, EVEN:ODD)) + allocate(decomp%act%i(volh, EVEN:ODD)) + + me = my_pe() + call unlex(me, DIM, i_pe, NPE) + + decomp%act%L = L + decomp%act%NPE = NPE + decomp%act%N = N + decomp%act%NH = NH + decomp%act%i_pe = i_pe + decomp%act%bc_fermions = bc_fermions + decomp%gamma_index = gamma_index + decomp%direction = decomp_direction + + do j = 1, DIM + i = gamma_index(j) + + decomp%std%L(i) = L(j) + decomp%std%NPE(i) = NPE(j) + decomp%std%N(i) = N(j) + decomp%std%i_pe(i) = i_pe(j) + decomp%std%bc_fermions(i) = bc_fermions(j) + enddo + + decomp%std%NH(1) = decomp%std%N(1) / 2 + decomp%std%NH(2) = decomp%std%N(2) + decomp%std%NH(3) = decomp%std%N(3) + decomp%std%NH(4) = decomp%std%N(4) + + + decomp%std%i = 0 + decomp%act%i = 0 + + do t = 0, decomp%act%N(4) - 1 + do z = 0, decomp%act%N(3) - 1 + do y = 0, decomp%act%N(2) - 1 + do x = 0, decomp%act%N(1) - 1 + + x_act(1) = x + x_act(2) = y + x_act(3) = z + x_act(4) = t + + x_std(gamma_index(1)) = x_act(1) + x_std(gamma_index(2)) = x_act(2) + x_std(gamma_index(3)) = x_act(3) + x_std(gamma_index(4)) = x_act(4) + + i_std = ieo(DIM, x_std, decomp%std%N) + 1 + i_act = ieo(DIM, x_act, decomp%act%N) + 1 + eo = e_o(x_act) + + decomp%std%i(i_act, eo) = i_std + decomp%act%i(i_std, eo) = i_act + enddo + enddo + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine init_module_lattice_io() + + use module_decomp + use module_lattice_io + implicit none + + L = decomp%std%L + N = decomp%std%N + NH = decomp%std%NH + NPE = decomp%std%NPE + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/iteration_count.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/iteration_count.F90 new file mode 100644 index 0000000000000000000000000000000000000000..da324c4e50424ad482a3bb5384164393552af0c1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/iteration_count.F90 @@ -0,0 +1,72 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! iteration_count.F90 - counts iterations in Hasenbusch improvement, +! does not work with tempering +! +!------------------------------------------------------------------------------- +module module_iteration_count + + integer, save :: it_f1 = 0 ! iterations in dsf1() + integer, save :: it_f2 = 0 ! iterations in dsf2() +end + +!----------------------------------------------------------------------------- +subroutine iteration_count_f1(iter) + + use module_iteration_count + implicit none + integer :: iter + + it_f1 = it_f1 + iter +end + +!----------------------------------------------------------------------------- +subroutine iteration_count_f2(iter) + + use module_iteration_count + implicit none + integer :: iter + + it_f2 = it_f2 + iter +end + +!----------------------------------------------------------------------------- +subroutine iteration_count_write(unit) + + use module_counter + use module_function_decl + use module_iteration_count + use module_switches + + implicit none + + integer, intent(in) :: unit + integer, save :: written = 0 + + character(*), parameter :: key = "%it" + character(*), parameter :: fmt_h = "(1x, 2a, a6, 2a16)" + character(*), parameter :: fmt_b = "(1x, a4, i6, 2i16)" + + if (switches%hasenbusch) then + + if (written == 0 .and. my_pe() == 0) then + write(unit, fmt_h) "T", key, "traj", "iterations(F1)", "iterations(F2)" + endif + + if (my_pe() == 0) write(unit, fmt_b) key, counter%traj, it_f1, it_f2 + + endif + + written = written + 1 + it_f1 = 0 + it_f2 = 0 +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/m_tilde.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/m_tilde.F90 new file mode 100644 index 0000000000000000000000000000000000000000..9782c4ede8d1d0226bfa97cf24df93a1a00ee280 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/m_tilde.F90 @@ -0,0 +1,107 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! m_tilde.F90 - matrix multiplications involving : +! +! M~ := \tilde{M} +! M~+ := \tilde{M}^\dagger +! +! M~ = 1 - kappa^2 Deo Doe (Wilson fermions) +! M~ = Tee - kappa^2 Deo Inv(Too) Doe (Wilson fermions + clover) +! M~ = H - kappa^2 Deo Inv(H) Doe (Wilson fermions + external h) +! +! and +! +! \tilde{M}^\dagger =: M~+ = 1 - kappa^2 Doe+ Deo+ +! +! subroutine mtil: out = M~ in +! subroutine mtil_dag: out = M~+ in +! subroutine mtdagmt: out = (M~+ M~) in --> mtdagmt.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine mtil(out, in, para, conf) + + use typedef_hmc + use module_p_interface + use module_vol + implicit none + + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(in) :: conf + + SPINCOL_FIELD, intent(out) :: out + SPINCOL_FIELD, intent(in) :: in + P_SPINCOL_FIELD, save :: tmp + REAL :: b + + + ALLOCATE_SC_FIELD(tmp) + + if (para%kappa /= ZERO) then + call d(ODD, EVEN, tmp, in, conf%u) + if (para%csw_kappa /= ZERO) call clover_mult_ao(conf%i(1,1,ODD), tmp, volh) + if (para%h /= ZERO) call h_mult_b(-para%h, tmp, volh) + call d(EVEN, ODD, out, tmp, conf%u) + endif + + b = -para%kappa**2 / (ONE + para%h**2) + + if (para%csw_kappa /= ZERO) then + call clover_mult_a(tmp, conf%a(1,1,EVEN), in, volh) + call sc_xpby(out, tmp, b) ! out = tmp - kappa**2 * out + else + call sc_xpby(out, in, b) ! out = in - kappa**2 * out + if (para%h /= ZERO) call h_mult_a(out, para%h, in, volh) + endif + +end + +!------------------------------------------------------------------------------- +subroutine mtil_dag(out, in, para, conf) + + use typedef_hmc + use module_p_interface + use module_vol + implicit none + + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(in) :: conf + + SPINCOL_FIELD, intent(out) :: out + SPINCOL_FIELD, intent(in) :: in + P_SPINCOL_FIELD, save :: tmp + REAL :: b + + + ALLOCATE_SC_FIELD(tmp) + + if (para%kappa /= ZERO) then + call d_dag(ODD, EVEN, tmp, in, conf%u) + if (para%csw_kappa /= ZERO) call clover_mult_ao(conf%i(1,1,ODD), tmp, volh) + if (para%h /= ZERO) call h_mult_b(para%h, tmp, volh) + call d_dag(EVEN, ODD, out, tmp, conf%u) + endif + + b = -para%kappa**2 / (ONE + para%h**2) + + if (para%csw_kappa /= ZERO) then + call clover_mult_a(tmp, conf%a(1,1,EVEN), in, volh) + call sc_xpby(out, tmp, b) ! out = tmp - kappa**2 * out + else + call sc_xpby(out, in, b) ! out = in - kappa**2 * out + if (para%h /= ZERO) call h_mult_a(out, -para%h, in, volh) + endif + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/mc.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/mc.F90 new file mode 100644 index 0000000000000000000000000000000000000000..b63a456278c9dedbb0242dcdda866ad5ab444b29 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/mc.F90 @@ -0,0 +1,299 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! mc.F90 - Monte Carlo loop (including tempering) +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module typedef_mc_temper + + type temper_out + REAL :: delta_h + integer :: pair + integer :: swapped + end type temper_out + +end module typedef_mc_temper + +!------------------------------------------------------------------------------- +subroutine mc(para, conf) + + use typedef_mc_temper + use typedef_para + use module_counter + use module_function_decl + use module_switches + use module_vol + implicit none + + type(type_para), intent(in) :: para + type(hmc_conf), dimension(MAX_TEMPER), intent(inout) :: conf + + type(hmc_out), dimension(MAX_TEMPER) :: out + type(temper_out), dimension(MAX_TEMPER) :: tmpr + integer :: i, j + integer :: iforce, itraj + integer :: force_accept = 0 + character(len = *), parameter :: key_fa = "%fa" + character(len = *), parameter :: key_mc = "%mc" + character(len = *), parameter :: key_swap = "%sw" + REAL :: plaq + REAL, external :: sf, sg + + if (switches%hmc_test) then + if (para%n_temper /= 1) call die("mc(): HMC-test: n_ensemble has to be 1") + call hmc_test(para%hmc(1), conf(1)) + return + endif + + force_accept = 1 + iforce = 0 + do while (counter%traj < 0 .and. counter%j_traj < para%ntraj) + + iforce = iforce + 1 + counter%traj = counter%traj + 1 + counter%j_traj = counter%j_traj + 1 + + if (iforce == 1 .and. my_pe() == 0) then + call begin(UREC, "ForceAcceptance") + write(UREC, 405) "T", key_fa, "i_fa", "e", & + "PlaqEnergy", "exp(-Delta_H)", "CGcalls", "CGitTot", "CGitMax" + endif + + do i = 1, para%n_temper + call hmc(para%hmc(i), conf(i), out(i), force_accept, HMC_TEST_NONE) + + plaq = out(i)%sg / (SIX * volume) + + if (my_pe() == 0) then + write(UREC, 415) key_fa, counter%traj, i, plaq, out(i)%exp_dh, & + out(i)%cg_ncall, out(i)%cg_niter_tot, out(i)%cg_niter_max + endif + enddo + enddo + + if (iforce > 0) then + call end_A(UREC, "ForceAcceptance") + endif + +405 format (1x, 2a, 2a5, 2a15, 3a8) +415 format (1x, a4, 2i5, 2f15.10, 3i8) + + + force_accept = 0 + itraj = 0 + do while (counter%traj >= 0 .and. counter%traj < para%maxtraj & + .and. counter%j_traj < para%ntraj) + + itraj = itraj + 1 + counter%traj = counter%traj + 1 + counter%j_traj = counter%j_traj + 1 + + if (itraj == 1 .and. my_pe() == 0) then + call begin(UREC, "MC") + + if (para%n_temper > 1) write(UREC, 450) & + "T", key_swap, "traj", "ie", "e1", "e2", "Delta_H", "Acc" + + write(UREC, 400) "T", key_mc, "traj", "e", "f", & + "PlaqEnergy", "exp(-Delta_H)", "Acc", & + "CGcalls", "CGitTot", "CGitMax" + endif + + do i = 1, para%n_temper + if (itraj == 1) then + if (switches%tempering .and. switches%dynamical) then + out(i)%sf = sf(para%hmc(i), conf(i)) + else + out(i)%sf = ZERO + endif + out(i)%sg = sg(conf(i)%u) + endif + + call hmc(para%hmc(i), conf(i), out(i), force_accept, HMC_TEST_NONE) + enddo + + if (para%n_temper > 1 .and. counter%traj > para%nstd) then + call temper(para%n_temper, para%swap_seq, para%hmc, conf, out, tmpr) + do i = 1, para%n_temper - 1 + if (my_pe() == 0) write(UREC, 460) key_swap, counter%traj, i, & + tmpr(i)%pair, tmpr(i)%pair + 1, tmpr(i)%delta_h, tmpr(i)%swapped + enddo + endif + + do i = 1, para%n_temper + j = conf(i)%former + + if (my_pe() == 0) write(UREC,410) key_mc, counter%traj, i, j, & + out(i)%sg / (SIX * volume), out(i)%exp_dh, out(i)%accepted, & + out(i)%cg_ncall, out(i)%cg_niter_tot, out(i)%cg_niter_max + + if (switches%measure_traces) & + call traces(para%hmc(i), conf(i), counter%traj, i, j) + + if (switches%measure_polyakov_loop) & + call polyakov_loop(conf(i), counter%traj, i, j) + + call cooling(conf(i)%u, counter%traj, i, j) + enddo + + if (para%nsave > 0) then + if (mod(counter%traj, para%nsave) == 0) then + call conf_write(.false., para, conf) + endif + endif + + enddo + + if (itraj > 0) then + call end_A(UREC, "MC") + endif + +400 format (1x, 2a, a6, 2a3, 2a15, a4, 3a8) +410 format (1x, a4, i6, 2i3, 2f15.10, i4, 3i8) + +450 format (1x, 2a, a6, 3a3, a18, a4) +460 format (1x, a4, i6, 3i3, f18.10, i4) + +end + +!------------------------------------------------------------------------------- +subroutine temper(n_temper, swap_seq, para, conf, action, tmpr) + + use typedef_hmc + use typedef_mc_temper + use module_function_decl + use module_p_interface + implicit none + + integer, intent(in) :: n_temper, swap_seq + type(hmc_para), dimension(n_temper), intent(in) :: para + type(hmc_conf), dimension(n_temper), intent(inout) :: conf + type(hmc_out), dimension(n_temper), intent(inout) :: action + type(temper_out), dimension(n_temper), intent(out) :: tmpr + + integer, dimension(n_temper) :: pair + integer :: i_pair, n_pair, i, j + integer :: swapped + REAL, dimension(2, 2) :: hf, hg + REAL, external :: sf, sg + REAL :: h_old, h_new, delta_h + REAL :: random + + if (n_temper == 1) return + + do i = 1, n_temper + conf(i)%former = i + enddo + + n_pair = n_temper - 1 + + call swap_sequence(swap_seq, pair, n_pair) + + do i_pair = 1, n_pair + i = pair(i_pair) + j = pair(i_pair) + 1 + + hg(1, 1) = para(i)%beta * action(i)%sg + hg(2, 2) = para(j)%beta * action(j)%sg + + hg(1, 2) = para(i)%beta * action(j)%sg + hg(2, 1) = para(j)%beta * action(i)%sg + + hf(1, 1) = action(i)%sf + hf(2, 2) = action(j)%sf + hf(1, 2) = sf(para(i), conf(j)) + hf(2, 1) = sf(para(j), conf(i)) + + if (para(i)%kappa == ZERO) then + if (hf(1, 1) /= ZERO) call die("temper(): hf(1, 1) /= 0 ") + if (hf(2, 1) /= ZERO) call die("temper(): hf(2, 1) /= 0 ") + if (hf(1, 2) /= ZERO) call die("temper(): hf(1, 2) /= 0 ") + if (hf(2, 2) /= ZERO) call die("temper(): hf(2, 2) /= 0 ") + endif + + h_old = hg(1, 1) + hf(1, 1) + hg(2, 2) + hf(2, 2) + h_new = hg(1, 2) + hf(1, 2) + hg(2, 1) + hf(2, 1) + + delta_h = h_new - h_old + + if (ranf() < exp(-delta_h)) then + swapped = 1 + else + swapped = 0 + endif + + if (swapped /= 0) then + call swap_p_g_field(conf(i)%u, conf(j)%u) + call swap_p_sc_field(conf(i)%phi, conf(j)%phi) + call swap_integer(conf(i)%former, conf(j)%former) + call swap_real(action(i)%sg, action(j)%sg) + action(i)%sf = hf(1, 2) + action(j)%sf = hf(2, 1) + endif + + tmpr(i_pair)%pair = i + tmpr(i_pair)%swapped = swapped + tmpr(i_pair)%delta_h = delta_h + enddo + + call check_former(n_temper, conf) + +end + +!------------------------------------------------------------------------------- +subroutine check_former(n_temper, conf) + + use typedef_hmc + use module_function_decl + implicit none + + integer, intent(in) :: n_temper + type(hmc_conf), intent(inout) :: conf(n_temper) + integer :: count(n_temper), i + + count = 0 + do i = 1, n_temper + count(conf(i)%former) = count(conf(i)%former) + 1 + enddo + + do i = 1, n_temper + if (count(i) /= 1) call die("check_former(): error") + enddo +end + +!------------------------------------------------------------------------------- +subroutine swap_sequence(type, s, n) + + implicit none + integer, intent(in) :: type, n + integer, intent(out) :: s(n) + integer :: i + + select case (type) + case (SWAP_UP) + do i = 1, n + s(i) = i + enddo + case (SWAP_DOWN) + do i = 1, n + s(i) = n - i + 1 + enddo + case (SWAP_RANDOM) + call random_sequence(s, n) + case default + call die("swap_sequence(): don't know how to build the sequence") + end select + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/misc.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/misc.F90 new file mode 100644 index 0000000000000000000000000000000000000000..8bfc895a5a6841503e93170fdfa2ca5d63366f24 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/misc.F90 @@ -0,0 +1,225 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! misc.F90 - miscellaneous (service routines) +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine die(msg) ! write "msg" to stderr and abort + + implicit none + character(len = *) msg + + write(STDERR,*) msg + call abbruch() + +end + +!------------------------------------------------------------------------------- +subroutine warn(msg) ! write "msg" to stderr and unit UREC + implicit none + character(len = *) msg + + write(STDERR,*) "WARNING: ", msg, " !!!" + write(UREC,*) "WARNING: ", msg, " !!!" +end + +!------------------------------------------------------------------------------- +subroutine assertion_failed(file, line, condition) !// used by ASSERT makro + implicit none + character(len = *) file, condition + integer line + + character(len = 8) aline + + write(aline, *) line + + call die(file // " (" // trim(aline) // "): assertion failed: " // condition) + +end + +!------------------------------------------------------------------------------- +subroutine begin(unit, str) ! write "begin marker" + + use module_function_decl + implicit none + integer :: unit + character(len = *) :: str + + if (my_pe() == 0) write(unit, "(1x,2a)") ">Begin", str + +end + +!------------------------------------------------------------------------------- +subroutine end_A(unit, str) ! write "end marker" + + use module_function_decl + implicit none + integer :: unit + character(len = *) :: str + + if (my_pe() == 0) write(unit, "(1x,2a)") ">End", str + +end + +!------------------------------------------------------------------------------- +function datum() ! returns date as: YYYY-MM-DD + + implicit none + character(len = 10) :: datum + + call date_and_time(date = datum) + + datum(9:10) = datum(7:8) + datum(6:7) = datum(5:6) + + datum(5:5) = "-" + datum(8:8) = "-" + +end + +!------------------------------------------------------------------------------- +function uhrzeit() ! returns time as: hh:mm:ss.sss + + implicit none + character(len = 12) :: uhrzeit + + call date_and_time(time = uhrzeit) + + uhrzeit(7:12) = uhrzeit(5:10) + uhrzeit(4:5) = uhrzeit(3:4) + + uhrzeit(3:3) = ":" + uhrzeit(6:6) = ":" + +end + +!------------------------------------------------------------------------------- +function f_exist(file) ! check in file exists + + implicit none + logical :: f_exist + character(len = *) :: file + + inquire(file = file, exist = f_exist) + +end + +!------------------------------------------------------------------------------- +subroutine open_diag() ! open debug file on each process + + use module_function_decl + implicit none + FILENAME :: name + + write(name, '(i4.4)') my_pe() + name = 'diag.' // name + + open(UDIAG, file = name) + + write(UDIAG,*) 'Output from PE ', my_pe() + write(UDIAG,*) '~~~~~~~~~~~~~~~~~~~' + write(UDIAG,*) + +end + +!------------------------------------------------------------------------------- +subroutine pos_keyword(unit, keyword) ! positions unit at keyword + + implicit none + integer, intent(in) :: unit + character(len = *), intent(in) :: keyword + integer :: iostat + character(len = len(keyword) + 1) :: word + + iostat = 0 + do while (iostat == 0) + read(unit, *, iostat = iostat) word + if (word == keyword) then + backspace(unit) + return + endif + enddo + + call die("pos_keyword(): " // keyword // ": not found") + +end + +!------------------------------------------------------------------------------- +subroutine read_keyword_int(unit, keyword, int, dim) ! read integer(s) at keyw. + + implicit none + integer, intent(in) :: unit, dim + character(len = *), intent(in) :: keyword + integer, intent(out) :: int(dim) + character :: c + + call pos_keyword(unit, keyword) + read(unit, *) c, int + +end + +!------------------------------------------------------------------------------- +subroutine read_keyword_REAL(unit, keyword, x, dim) ! read float(s) at keyword + + implicit none + integer, intent(in) :: unit, dim + character(len = *), intent(in) :: keyword + REAL, intent(out) :: x(dim) + character :: c + + call pos_keyword(unit, keyword) + read(unit, *) c, x + +end + +!------------------------------------------------------------------------------- +subroutine swap_endian8(n, a) + + implicit none + integer, parameter :: i8 = 8 + integer, intent(in) :: n + integer :: i + integer(i8), intent(inout) :: a(n) + integer(i8), save :: mask1, mask2, mask3, mask4, & + mask5, mask6, mask7, mask8 + integer(i8) :: tmp + logical, external :: is_big_endian + + data mask1 /z'00000000000000FF'/ + data mask2 /z'000000000000FF00'/ + data mask3 /z'0000000000FF0000'/ + data mask4 /z'00000000FF000000'/ + data mask5 /z'000000FF00000000'/ + data mask6 /z'0000FF0000000000'/ + data mask7 /z'00FF000000000000'/ + data mask8 /z'FF00000000000000'/ + + + if (is_big_endian()) return + + do i = 1, n + tmp = 0_i8 + tmp = ior(tmp, ishft(iand(a(i), mask1), 56_i8)) + tmp = ior(tmp, ishft(iand(a(i), mask2), 40_i8)) + tmp = ior(tmp, ishft(iand(a(i), mask3), 24_i8)) + tmp = ior(tmp, ishft(iand(a(i), mask4), 8_i8)) + tmp = ior(tmp, ishft(iand(a(i), mask5), -8_i8)) + tmp = ior(tmp, ishft(iand(a(i), mask6),-24_i8)) + tmp = ior(tmp, ishft(iand(a(i), mask7),-40_i8)) + tmp = ior(tmp, ishft(iand(a(i), mask8),-56_i8)) + a(i) = tmp + enddo + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..884239c63b5f8b05e93c5aed91fcf2ffae14bb05 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/Makefile @@ -0,0 +1,59 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 2003-2006, Hinnerk Stueben, Zuse-Institut Berlin +# +#------------------------------------------------------------------------------- +# +# modules/Makefile +# +#=============================================================================== + +include ../Makefile.defs + +.SUFFIXES: +.SUFFIXES: .a .o .F90 + +.F90.o: + $(FPP) -I.. $(FPPFLAGS) $< > $*.f90 + $(F90) -c $(FFLAGS) $*.f90 + +MODULES_DIR = . + +MODULES = \ + typedef_cksum.o \ + typedef_clover.o \ + typedef_flags.o \ + typedef_hmc.o \ + typedef_para.o \ + module_bqcd.o \ + module_counter.o \ + module_d21.o \ + module_decomp.o \ + module_hmc_forces.o \ + module_lattice.o \ + module_lattice_io.o \ + module_nn.o \ + module_nnpe.o \ + module_offset.o \ + module_thread.o \ + module_vol.o \ + module_conf_info.o \ + module_function_decl.o \ + module_input.o \ + module_switches.o \ + module_p_interface.o \ + module_mre.o + +modules: $(MODULES) + +fast: + $(MAKE) + +clean: + rm -f *.[Tiod] *.f90 *.mod work.pc work.pcl + +clobber: clean diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/README b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/README new file mode 100644 index 0000000000000000000000000000000000000000..2edcd731ef7bc59540583b4442f0463e6cd88ff9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/README @@ -0,0 +1,23 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! modules/README +! +!------------------------------------------------------------------------------- + +This directory contain "modules" that are needed in more than one +source files. + +"modules" that are needed only in one file are kept in that file. + +For historical reasons some information is stored in more than one module. +Many "modules" were "common blocks" in older versions of the programme. + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/bqcd.pcl b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/bqcd.pcl new file mode 100644 index 0000000000000000000000000000000000000000..7f6c06e225907b3e12cb0daddf00f6b753a7ab89 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/bqcd.pcl @@ -0,0 +1 @@ +work.pc diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_bqcd.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_bqcd.F90 new file mode 100644 index 0000000000000000000000000000000000000000..ad4b746f7d9aac4a9008bbd0818d0ddd36d65710 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_bqcd.F90 @@ -0,0 +1,22 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_bqcd.F90 +! +!------------------------------------------------------------------------------- +module module_bqcd + + character(len = *), parameter :: prog_name = "bqcd" + character(len = *), parameter :: prog_version = "benchmark2" + integer, parameter :: input_version = 4 + integer, parameter :: conf_info_version = 3 + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_conf_info.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_conf_info.F90 new file mode 100644 index 0000000000000000000000000000000000000000..81f8b1e214c3ca217eeb5037f6459fdf69d7f785 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_conf_info.F90 @@ -0,0 +1,47 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_conf_info.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_conf_info + + type type_conf_info + REAL, dimension(2) :: beta, kappa, csw, csw_kappa, h + REAL :: plaq + integer, dimension(DIM) :: L, bc_fermions + integer :: rkind + integer, dimension(2) :: ensemble + end type type_conf_info + + character(len = *), parameter :: k_format = "Format" + character(len = *), parameter :: k_prog = "Program" + character(len = *), parameter :: k_run = "Run" + character(len = *), parameter :: k_traj = "Traj" + character(len = *), parameter :: k_host = "Host" + character(len = *), parameter :: k_date = "Date" + character(len = *), parameter :: k_L = "L" + character(len = *), parameter :: k_bc = "bc_fermions" + character(len = *), parameter :: k_rkind = "REAL_kind" + character(len = *), parameter :: k_plaq = "PlaqEnergy" + + character(len = *), parameter, dimension(2) :: & + k_ensemble = (/ "ensemble ", "former_ensemble " /), & + k_beta = (/ "beta ", "former_beta " /), & + k_kappa = (/ "kappa ", "former_kappa " /), & + k_csw = (/ "csw ", "former_csw " /), & + k_csw_kappa = (/ "csw_kappa ", "former_csw_kappa" /), & + k_h = (/ "h ", "former_h " /) + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_counter.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_counter.F90 new file mode 100644 index 0000000000000000000000000000000000000000..f539c06f9a7af052abcdc1e5acba841021ebea9a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_counter.F90 @@ -0,0 +1,30 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_counter.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_counter + + type type_counter + sequence + integer :: run + integer :: job + integer :: traj ! overall trajectory counter + integer :: j_traj ! job trajectory counter + end type type_counter + + type(type_counter), save :: counter + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_d21.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_d21.F90 new file mode 100644 index 0000000000000000000000000000000000000000..d35a29d17f3dd7396bae9caeff19d5947c76a54e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_d21.F90 @@ -0,0 +1,21 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_d21.F90 - Two component spincol field used in d/D21.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_d21 + P_SC2_FIELD, save :: a +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_decomp.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_decomp.F90 new file mode 100644 index 0000000000000000000000000000000000000000..677e085e15f2e80d18840a0a86471bc7ae810657 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_decomp.F90 @@ -0,0 +1,36 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_decomp.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_decomp + + type type_decomp1 + integer, dimension(DIM) :: L, N, NH, NPE + integer, dimension(DIM) :: i_pe + integer, dimension(DIM) :: bc_fermions + INTEGER, dimension(:, :), pointer :: i + end type type_decomp1 + + type type_decomp2 + type(type_decomp1) :: std ! "standard" + type(type_decomp1) :: act ! "actual" (essentially module_lattice) + integer, dimension(DIM) :: gamma_index + integer, dimension(DIM) :: direction + end type type_decomp2 + + type(type_decomp2), save :: decomp + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_function_decl.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_function_decl.F90 new file mode 100644 index 0000000000000000000000000000000000000000..4e4afc9bc67f8d465b82de35223322c32795dc96 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_function_decl.F90 @@ -0,0 +1,67 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_function_decl.F90 - declaration of functions +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_function_decl + + ! functions in misc.F90: + + character(len=10), external :: datum + character(len=12), external :: uhrzeit + character(len=20), external :: rechner + logical, external :: f_exist + + ! processes (-> comm/pes*.F90): + + integer, external :: num_pes + +#ifdef CRAY + integer, intrinsic :: my_pe ! gives the same id as MPI_Rank in MPI_COMM_WORLD +#else + integer, external :: my_pe +#endif + + ! global reduction (-> comm/reduction_*.F90) + + REAL, external :: dotprod + REAL, external :: global_sum + real, external :: global_min + real, external :: global_max + + ! sc-field (-> sc.F90) + + REAL, external :: sc_norm2 + REAL, external :: sc_dot + COMPLEX, external :: sc_cdotc + + ! ranf: + +#ifdef CRAY + real(8), intrinsic :: ranf +#else + real(8), external :: ranf +#endif + + ! identification of D (-> d/DVersion.F90) + + integer, external :: version_of_d + integer, external :: get_d3_buffer_vol + + ! communication method (-> comm/comm_*.F90) + + COMM_METHOD, external :: comm_method + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_hmc_forces.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_hmc_forces.F90 new file mode 100644 index 0000000000000000000000000000000000000000..c4cd3dbabe5abd6a14c6392d166c44a6de32ac40 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_hmc_forces.F90 @@ -0,0 +1,33 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_hmc_forces.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_hmc_forces + + P_GENERATOR_FIELD, save :: p_old + + integer, parameter :: n_force = 4 + + integer, parameter :: i_sg = 1 + integer, parameter :: i_sd = 2 + integer, parameter :: i_sf1 = 3 + integer, parameter :: i_sf2 = 4 + + REAL, save :: f_count(n_force) + REAL, save :: f_avg(n_force) + REAL, save :: f_max(n_force) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_input.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_input.F90 new file mode 100644 index 0000000000000000000000000000000000000000..96aede4988f919ae9b63c9745e0535a8b4a0a74c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_input.F90 @@ -0,0 +1,174 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_input.F90 +! +! Simple module to read "keyword value" formatted input. The keywords +! are defined in "module_input.h" together with a type definition and a +! default value. The input is stored in the structure "input". The +! components are called "input%". +! +! New input/keywords can be added or modified in "module_input.h". It +! suffices to modify "module_input.h" execpt for input of type +! INPUT_ARRAY_ENSEMBLES for which (re)allocation and initialisation +! routines have to be provided in this file (in subroutines "input_allocate" +! and "input_reallocate"). +! +!------------------------------------------------------------------------------- +# include "defs.h" + +# define INPUT_ARRAY_DIM integer, dimension(DIM) +# define INPUT_ARRAY_ENSEMBLES character(para_len), dimension(:), pointer +# define INPUT_DEFAULT_ENSEMBLES 1 + +!------------------------------------------------------------------------------- +module module_input + + implicit none + + !----------------------------------------------------------------------------- + public :: type_input ! data structure declaration + public :: input ! variable containing data structure + public :: input_read ! subroutine input_read(file) + public :: input_dump ! subroutine input_dump(unit) + !----------------------------------------------------------------------------- + + private + + integer, parameter :: comment_len = 80 + integer, parameter :: keyword_len = 32 + integer, parameter :: para_len = 32 + integer, parameter :: word_len = 8 + + type type_input + +#define INPUT_INPUT(var, type, default) type :: var +#include "module_input.h" + + end type type_input + + type(type_input), save :: input + +contains + + !============================================================================= + subroutine input_read(file) + + character(*), intent(in) :: file + integer :: iostat + character(keyword_len) :: keyword + + call input_allocate(INPUT_DEFAULT_ENSEMBLES) + call input_defaults() + + open(UINPUT, file = file, action = "read", status = "old") + + do + read(UINPUT, *, iostat = iostat) keyword + if (iostat /= 0) exit + if (keyword(1:1) == '#') cycle + backspace(UINPUT) + select case (keyword) + + +#undef INPUT_INPUT +#define INPUT_INPUT(var, type, default) \ + case (#var); read(UINPUT, *) keyword, input% ## var +#include "module_input.h" + + case default + call die("input_read(): " // trim(keyword) & + // ": unknown keyword") + + end select + + if (keyword == "ensembles") then + call input_reallocate(input%ensembles) + endif + + enddo + close(UINPUT) + + end subroutine input_read + + !----------------------------------------------------------------------------- + subroutine input_dump(unit) + integer :: unit + + if (.not. associated(input%beta)) then + call input_allocate(INPUT_DEFAULT_ENSEMBLES) + call input_defaults() + endif + +#undef INPUT_INPUT +#define INPUT_INPUT(var, type, default) write(unit,*) #var, " ", input% ## var +#include "module_input.h" + + end subroutine input_dump + + !----------------------------------------------------------------------------- + subroutine input_defaults() + +#undef INPUT_INPUT +#define INPUT_INPUT(var, type, default) input% ## var = default +#include "module_input.h" + + end subroutine input_defaults + + !----------------------------------------------------------------------------- + subroutine input_allocate(size) + + integer :: size + + allocate(input%beta(size)) + allocate(input%kappa(size)) + allocate(input%csw(size)) + allocate(input%h(size)) + allocate(input%hmc_trajectory_length(size)) + allocate(input%hmc_steps(size)) + allocate(input%hmc_rho(size)) + allocate(input%hmc_m_scale(size)) + allocate(input%start_info_file(size)) + + input%beta = "0.0" + input%kappa = "0.0" + input%csw = "0.0" + input%h = "0.0" + input%hmc_trajectory_length = "1" + input%hmc_steps = "0" + input%hmc_rho = "0.0" + input%hmc_m_scale = "1" + input%start_info_file = "" + + end subroutine input_allocate + + + !----------------------------------------------------------------------------- + subroutine input_reallocate(size) + + integer :: size + + if (size > INPUT_DEFAULT_ENSEMBLES) then + deallocate(input%beta) + deallocate(input%kappa) + deallocate(input%csw) + deallocate(input%h) + deallocate(input%hmc_trajectory_length) + deallocate(input%hmc_steps) + deallocate(input%hmc_rho) + deallocate(input%hmc_m_scale) + deallocate(input%start_info_file) + + call input_allocate(size) + endif + end subroutine input_reallocate + +end module module_input +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_input.h b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_input.h new file mode 100644 index 0000000000000000000000000000000000000000..a0aa2e325556644801432584b2d10bafc0a804d6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_input.h @@ -0,0 +1,42 @@ +INPUT_INPUT(run, integer, 0) +INPUT_INPUT(comment, character(comment_len), "") + +INPUT_INPUT(lattice, INPUT_ARRAY_DIM, (/4, 4, 4, 4/)) +INPUT_INPUT(processes, INPUT_ARRAY_DIM, (/1, 1, 1, 1/)) +INPUT_INPUT(boundary_conditions_fermions, INPUT_ARRAY_DIM, (/1, 1, 1, -1/)) +INPUT_INPUT(gamma_index, INPUT_ARRAY_DIM, (/1, 2, 3, 4/)) + +INPUT_INPUT(ensembles, integer, INPUT_DEFAULT_ENSEMBLES) + +INPUT_INPUT(beta, INPUT_ARRAY_ENSEMBLES, "0.0") +INPUT_INPUT(kappa, INPUT_ARRAY_ENSEMBLES, "0.0") +INPUT_INPUT(csw, INPUT_ARRAY_ENSEMBLES, "0.0") +INPUT_INPUT(h, INPUT_ARRAY_ENSEMBLES, "0.0") + +INPUT_INPUT(tempering_swap_sequence, character(word_len), "random") +INPUT_INPUT(tempering_steps_without, integer, 0) + +INPUT_INPUT(hmc_model, character, "A") +INPUT_INPUT(hmc_trajectory_length, INPUT_ARRAY_ENSEMBLES, "1") +INPUT_INPUT(hmc_steps, INPUT_ARRAY_ENSEMBLES, "0") +INPUT_INPUT(hmc_rho, INPUT_ARRAY_ENSEMBLES, "0.0") +INPUT_INPUT(hmc_m_scale, INPUT_ARRAY_ENSEMBLES, "1") +INPUT_INPUT(hmc_accept_first, integer, 0) +INPUT_INPUT(hmc_test, integer, 0) + +INPUT_INPUT(start_configuration, character(word_len), "cold") +INPUT_INPUT(start_info_file, INPUT_ARRAY_ENSEMBLES, "") +INPUT_INPUT(start_random, character(para_len), "default") + +INPUT_INPUT(mc_total_steps, integer, 1) +INPUT_INPUT(mc_steps, integer, 1) +INPUT_INPUT(mc_save_frequency, integer, 0) + +INPUT_INPUT(solver_rest, character(para_len), "1e-8") +INPUT_INPUT(solver_maxiter, integer, 100) +INPUT_INPUT(solver_ignore_no_convergence, integer, 0) +INPUT_INPUT(solver_mre_vectors, integer, 0) + +INPUT_INPUT(measure_cooling_list, FILENAME, "") +INPUT_INPUT(measure_polyakov_loop, integer, 0) +INPUT_INPUT(measure_traces, integer, 0) diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_lattice.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_lattice.F90 new file mode 100644 index 0000000000000000000000000000000000000000..2cf8dcdc283a5f2e9fda4ca98e9f3ef3cbb48276 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_lattice.F90 @@ -0,0 +1,48 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_lattice.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_lattice + + !>> See also "module_lattice_io" !!! + + ! use a common block, because without, equivalence leads to errors + ! with Intel Fortran90 compiler + + integer, dimension(DIM) :: L, N, NH, NPE + + common /common_lattice/ L, N, NH, NPE + + integer :: LX, LY, LZ, LT + integer :: NX, NY, NZ, NT, NXH + + equivalence (L(1), LX) + equivalence (L(2), LY) + equivalence (L(3), LZ) + equivalence (L(4), LT) + + equivalence (N(1), NX) + equivalence (N(2), NY) + equivalence (N(3), NZ) + equivalence (N(4), NT) + + equivalence (NH(1), NXH) + + integer, dimension(DIM), save :: bc_fermions + integer, dimension(DIM), save :: gamma_index + integer, dimension(DIM), save :: decomp_direction + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_lattice_io.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_lattice_io.F90 new file mode 100644 index 0000000000000000000000000000000000000000..492243725ea4bc60c781687d73d1afb3d6cd9765 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_lattice_io.F90 @@ -0,0 +1,47 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_lattice_io.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_lattice_io + + !>> The common block is syntactically identical to + !>> module_lattice/common_lattice. But it contains permutated + !>> values according to "gamma_index". + !>> This can be confusing when reading source code !!!! + + ! use a common block, because without, equivalence leads to errors + ! with Intel Fortran90 compiler + + integer, dimension(DIM) :: L, N, NH, NPE + + common /common_lattice_io/ L, N, NH, NPE + + integer :: LX, LY, LZ, LT + integer :: NX, NY, NZ, NT, NXH + + equivalence (L(1), LX) + equivalence (L(2), LY) + equivalence (L(3), LZ) + equivalence (L(4), LT) + + equivalence (N(1), NX) + equivalence (N(2), NY) + equivalence (N(3), NZ) + equivalence (N(4), NT) + + equivalence (NH(1), NXH) + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_mre.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_mre.F90 new file mode 100644 index 0000000000000000000000000000000000000000..21d58a534d941c6a6dffc5fcd636ba91d06c4cda --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_mre.F90 @@ -0,0 +1,34 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_mre.F90 +! +! Important: type(type_mre) must be defined with the "save" attribute! +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_mre + + integer, save :: mre_n_vec = 0 + + type mre_pointer_to_sc_field + P_SPINCOL_FIELD :: sc + end type mre_pointer_to_sc_field + + type type_mre + integer :: rank + type(mre_pointer_to_sc_field), dimension(:), pointer :: vec + end type type_mre + + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_nn.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_nn.F90 new file mode 100644 index 0000000000000000000000000000000000000000..20bcb9536e195c925e2ac1991c6ade199466cecc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_nn.F90 @@ -0,0 +1,23 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_nn.F90 - pointer to nearest neighbour list +! nn(volh_tot, EVEN:ODD, DIM, FWD:BWD) +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_nn + + INTEGER, dimension(:, :, :, :), pointer, save :: nn + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_nnpe.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_nnpe.F90 new file mode 100644 index 0000000000000000000000000000000000000000..4969e97237ab53e6d174ad888db59f265cc9489d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_nnpe.F90 @@ -0,0 +1,22 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_nnpe.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_nnpe + + integer, dimension (-1:1, -1:1, -1:1, -1:1), save :: nnpe + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_offset.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_offset.F90 new file mode 100644 index 0000000000000000000000000000000000000000..167f47a0b3fd95c7da599bc578ffb26f81b294a7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_offset.F90 @@ -0,0 +1,22 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_offset.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_offset + + integer, dimension (-1:1, -1:1, -1:1, -1:1), save :: offset + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_p_interface.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_p_interface.F90 new file mode 100644 index 0000000000000000000000000000000000000000..9163c7aa4113b65a4c87f3b9703a2596fbe708a3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_p_interface.F90 @@ -0,0 +1,80 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_p_interface.F90 ! interfaces of pointer manipulating routines +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_p_interface + + interface + + subroutine allocate_g_field(u) + P_GAUGE_FIELD :: u + end subroutine allocate_g_field + + subroutine allocate_g_field_io(u) + P_GAUGE_FIELD_IO :: u + end subroutine allocate_g_field_io + + subroutine allocate_gen_field(p) + P_GENERATOR_FIELD :: p + end subroutine allocate_gen_field + + subroutine allocate_sc_field(a) + P_SPINCOL_FIELD :: a + end subroutine allocate_sc_field + + subroutine allocate_sc_field_io(a) + P_SPINCOL_FIELD_IO :: a + end subroutine allocate_sc_field_io + + subroutine allocate_sc_overindexed(a) + P_SPINCOL_OVERINDEXED :: a + end subroutine allocate_sc_overindexed + + subroutine allocate_sc2_field(a) + P_SC2_FIELD :: a + end subroutine allocate_sc2_field + + subroutine allocate_clover_field_a(a) + use typedef_clover + P_CLOVER_FIELD_A :: a + end subroutine allocate_clover_field_a + + subroutine allocate_clover_field_b(b) + use typedef_clover + P_CLOVER_FIELD_B :: b + end subroutine allocate_clover_field_b + + subroutine swap_p_g_field(u, v) + P_GAUGE_FIELD :: u, v + end subroutine swap_p_g_field + + subroutine swap_p_sc_field(a, b) + P_SPINCOL_FIELD :: a, b + end subroutine swap_p_sc_field + + subroutine swap_p_clover_field_a(x, y) + use typedef_clover + P_CLOVER_FIELD_A :: x, y + end subroutine swap_p_clover_field_a + + subroutine swap_p_clover_field_b(x, y) + use typedef_clover + P_CLOVER_FIELD_B :: x, y + end subroutine swap_p_clover_field_b + + end interface + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_switches.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_switches.F90 new file mode 100644 index 0000000000000000000000000000000000000000..28543df8749de32397d153292b9a236cabeb03ee --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_switches.F90 @@ -0,0 +1,31 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_switches.F90 +! +!------------------------------------------------------------------------------- + +module module_switches + + type type_switches + logical :: quenched + logical :: dynamical + logical :: tempering + logical :: clover + logical :: h_ext + logical :: hasenbusch + logical :: hmc_test + logical :: measure_polyakov_loop + logical :: measure_traces + end type type_switches + + type (type_switches), save :: switches +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_thread.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_thread.F90 new file mode 100644 index 0000000000000000000000000000000000000000..bd51c2cfdea6bf1908a8533149139d42e0e62624 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_thread.F90 @@ -0,0 +1,26 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_thread.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_thread + + ! {xyz|t}_{start|end} (0:n_thread-1) + + integer, save :: n_thread + integer, dimension(:), pointer, save :: xyz_start, xyz_end + integer, dimension(:), pointer, save :: t_start, t_end + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_vol.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_vol.F90 new file mode 100644 index 0000000000000000000000000000000000000000..388675e2d141943ca2847e0886c36777d5a65f25 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_vol.F90 @@ -0,0 +1,22 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_vol.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_vol + + integer, save :: volume, vol, volh, volh_tot, size_sc_field + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_cksum.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_cksum.F90 new file mode 100644 index 0000000000000000000000000000000000000000..8338b75f7f099899f0737ee891e3d3466ad80e3f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_cksum.F90 @@ -0,0 +1,26 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! typedef_cksum.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module typedef_cksum + + type type_cksum + CHECK_SUM :: sum + CHECK_SUM :: bytes + FILENAME :: file + end type type_cksum + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_clover.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_clover.F90 new file mode 100644 index 0000000000000000000000000000000000000000..e580e642e82fd6c766eac46696784822c65f058d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_clover.F90 @@ -0,0 +1,85 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! typedef_clover.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module typedef_clover + + type type_clover_a + sequence + + REAL i11 + REAL i22 + + COMPLEX i12 + COMPLEX i13 + COMPLEX i14 + COMPLEX i15 + COMPLEX i16 + + COMPLEX i23 + COMPLEX i24 + COMPLEX i25 + COMPLEX i26 + + REAL i33 + REAL i44 + + COMPLEX i34 + COMPLEX i35 + COMPLEX i36 + + COMPLEX i45 + COMPLEX i46 + + REAL i55 + REAL i66 + + COMPLEX i56 + end type type_clover_a + + + type type_clover_b + sequence + + COMPLEX i21 + + COMPLEX i31 + COMPLEX i32 + + COMPLEX i41 + COMPLEX i42 + COMPLEX i43 + + COMPLEX i51 + COMPLEX i52 + COMPLEX i53 + COMPLEX i54 + + COMPLEX i61 + COMPLEX i62 + COMPLEX i63 + COMPLEX i64 + COMPLEX i65 + + REAL i11 + REAL i22 + REAL i33 + REAL i44 + REAL i55 + REAL i66 + end type type_clover_b + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_flags.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_flags.F90 new file mode 100644 index 0000000000000000000000000000000000000000..ca191bd165ace9d669cd785527c2802615243527 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_flags.F90 @@ -0,0 +1,26 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! typedef_flags.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module typedef_flags + + type type_flags + logical :: show_version + logical :: continuation_job + FILENAME :: input + end type type_flags + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_hmc.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_hmc.F90 new file mode 100644 index 0000000000000000000000000000000000000000..3e1a1d1662053d88c4fe08eb40ea6901490cdc62 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_hmc.F90 @@ -0,0 +1,69 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! typedef_hmc.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module typedef_hmc + + use typedef_clover + + type hmc_para + REAL :: beta + REAL :: kappa + REAL :: csw + REAL :: csw_kappa + REAL :: h + REAL :: traj_length + REAL :: tau + REAL :: rho + integer :: ntau + integer :: m_scale + character :: model + end type hmc_para + + type hmc_para_char + character(len = 20) :: beta + character(len = 20) :: kappa + character(len = 20) :: csw + character(len = 20) :: csw_kappa + character(len = 20) :: h + character(len = 20) :: traj_length + character(len = 20) :: tau + character(len = 20) :: ntau + character(len = 20) :: rho + character(len = 20) :: m_scale + end type hmc_para_char + + type hmc_out + REAL :: exp_dh ! exp(-Delta H) + REAL :: sg ! without factor beta + REAL :: sf + integer :: accepted + integer :: cg_ncall + integer :: cg_niter_max + integer :: cg_niter_tot + end type hmc_out + + type hmc_conf + P_GAUGE_FIELD :: u + P_SPINCOL_FIELD :: phi + P_SPINCOL_FIELD :: phi2 + P_CLOVER_FIELD_A :: a ! A := 1 - kappa c_sw sigma F (in 6x6 blocks) + P_CLOVER_FIELD_A :: i ! inverse of A + P_CLOVER_FIELD_B :: b ! inverse of A in (L D L+) decomposed form + integer :: former ! ensemble index before tempering + end type hmc_conf + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_para.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_para.F90 new file mode 100644 index 0000000000000000000000000000000000000000..70fee2ca94708b611e562a166648339961cf0d79 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_para.F90 @@ -0,0 +1,57 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! typedef_para.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module typedef_para + + use typedef_hmc + + type type_para + + integer :: run + + integer, dimension(DIM) :: L + integer, dimension(DIM) :: NPE + integer, dimension(DIM) :: bc_fermions + integer, dimension(DIM) :: gamma_index + + integer :: n_temper + + type(hmc_para), dimension(MAX_TEMPER) :: hmc + type(hmc_para_char), dimension(MAX_TEMPER) :: c_hmc + + integer :: start + SEED :: seed + integer :: swap_seq + + integer :: nforce + integer :: ntraj + integer :: nstd + integer :: maxtraj + + integer :: nsave + + real :: cg_rest + integer :: cg_maxiter + integer :: cg_log + + character(len = 20) c_cg_rest + + FILENAME, dimension(MAX_TEMPER) :: info_file + + end type type_para + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/mre.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/mre.F90 new file mode 100644 index 0000000000000000000000000000000000000000..4e9078d1d1bc97321380ae2abc4930816aa4b077 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/mre.F90 @@ -0,0 +1,270 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! mre.F90 - Chronological Inverter by Minimal Residual Extrapolation +! hep-lat/9509012 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine mre_put(basis, sc_field, reset) ! add a solution + + use module_mre + use module_p_interface + use module_vol + implicit none + + type(type_mre), intent(inout) :: basis + SPINCOL_FIELD, intent(in) :: sc_field + integer, intent(in) :: reset + + P_SPINCOL_FIELD :: tmp + integer :: i + + if (mre_n_vec == 0) then + return + endif + + call mre_allocate(basis) + + if (reset /= 0) then + basis%rank = 0 + return + endif + + tmp => basis%vec(mre_n_vec)%sc + + do i = mre_n_vec, 2, -1 + basis%vec(i)%sc => basis%vec(i - 1)%sc + enddo + + basis%vec(1)%sc => tmp + call sc_copy(basis%vec(1)%sc, sc_field) + + if (basis%rank < mre_n_vec) basis%rank = basis%rank + 1 + +end + +!------------------------------------------------------------------------------- +subroutine mre_get(basis, matrix_mult, trial, phi, para, conf) + + ! get trial solution + + use typedef_hmc + use module_function_decl + use module_mre + use module_vol + implicit none + + type(type_mre), intent(inout) :: basis + external :: matrix_mult + SPINCOL_FIELD, intent(out) :: trial + SPINCOL_FIELD, intent(in) :: phi + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(in) :: conf + + type(type_mre), save :: mv ! "Matrix * v" + COMPLEX, target :: g(mre_n_vec, mre_n_vec + 1) + COMPLEX, pointer :: b(:) + integer :: size_g + integer :: i + integer :: j + integer :: s + integer :: c + integer :: rest + + if (mre_n_vec == 0 .or. .not. associated(basis%vec) .or. basis%rank == 0) then + call sc_copy(trial, phi) + return + endif + + if (basis%rank == 1) then + call sc_copy(trial, basis%vec(1)%sc) + return + endif + + size_g = mre_n_vec * (mre_n_vec + 1) * SIZE_COMPLEX + + b => g(:, mre_n_vec + 1) ! storage arrangement for global sum + ! => one global sum for everything + + call mre_allocate(mv) + call mre_gram_schmidt(basis) + + do i = 1, basis%rank + b(i) = sc_cdotc(basis%vec(i)%sc, phi) + call matrix_mult(mv%vec(i)%sc, basis%vec(i)%sc, para, conf) + enddo + + do i = 1, basis%rank + g(i, i) = sc_norm2(mv%vec(i)%sc) + do j = i + 1, basis%rank + g(i, j) = sc_cdotc(mv%vec(i)%sc, mv%vec(j)%sc) + g(j, i) = conjg(g(i, j)) + enddo + enddo + + call global_sum_vec(size_g, g) + + call mre_gauss_jordan(g, b, basis%rank, mre_n_vec) + + ! calculation of "trial" with doubled data re-use: + + call sc_cax2(trial, basis%vec(1)%sc, b(1), basis%vec(2)%sc, b(2)) + + rest = mod(basis%rank, 2) + + do j = 3, basis%rank - rest, 2 + call sc_caxpy2(trial, basis%vec(j)%sc, b(j), & + basis%vec(j+1)%sc, b(j+1)) + enddo + + if (rest == 1) then + j = basis%rank + call sc_caxpy(trial, basis%vec(j)%sc, b(j)) + endif + +end + +!------------------------------------------------------------------------------- +subroutine mre_allocate(basis) + + use module_mre + use module_p_interface + use module_vol + implicit none + + type(type_mre), intent(inout) :: basis + integer :: i + + if (.not. associated(basis%vec)) then + allocate(basis%vec(mre_n_vec)) + do i = 1, mre_n_vec + nullify(basis%vec(i)%sc) + call allocate_sc_field(basis%vec(i)%sc) + enddo + basis%rank = 0 + endif + +end + +!------------------------------------------------------------------------------- +subroutine mre_gram_schmidt(basis) + + ! Golub and van Loon, Matrix Computations (3rd ed.), p. 232 + + use module_function_decl + use module_mre + use module_vol + implicit none + + type(type_mre), intent(inout) :: basis + + integer :: k, j + REAL :: r_kk, r_kj + + do k = 1, basis%rank + r_kk = sc_norm2(basis%vec(k)%sc) + r_kk = global_sum(r_kk) + r_kk = ONE / sqrt(r_kk) + call sc_scale(basis%vec(k)%sc, r_kk) + do j = k + 1, basis%rank + r_kj = sc_dot(basis%vec(k)%sc, basis%vec(j)%sc) + r_kj = global_sum(r_kj) + call sc_axpy(basis%vec(j)%sc, basis%vec(k)%sc, -r_kj) + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine mre_gauss_jordan(a, b, n, np) + + ! Numerical Recipes in Fortran (2nd ed.), p. 30 + ! inv(a) i.e. a is not unscrambled + + implicit none + + integer, intent(in) :: n, np + COMPLEX, intent(inout) :: a(np,np), b(np) + + integer :: ipiv(n) + integer :: i, j, k, l, ll + integer :: irow, icol + + REAL :: big, tmp + COMPLEX :: dum + COMPLEX :: pivinv + + ipiv = 0 + + do i = 1, n + + big = ZERO + do j = 1, n + if (ipiv(j) /= 1) then + do k = 1, n + if (ipiv(k) == 0) then + tmp = abs(a(j, k)) + if (tmp >= big) then + big = tmp + irow = j + icol = k + endif + else if (ipiv(k) > 1) then + call die("mre_gauss_jordan(): singular matrix 1") + endif + enddo + endif + enddo + + ipiv(icol) = ipiv(icol) + 1 + + if (irow /= icol) then + do l = 1, n + dum = a(irow, l) + a(irow, l) = a(icol, l) + a(icol, l) = dum + enddo + dum = b(irow) + b(irow) = b(icol) + b(icol) = dum + endif + + if (a(icol, icol) == ZERO ) then + call die("mre_gauss_jordan(): singular matrix 2") + endif + + pivinv = ONE / a(icol, icol) + !!a(icol, icol) = ONE !! only needed for inv(a) + + do l = 1, n + a(icol, l) = a(icol, l) * pivinv + enddo + + b(icol) = b(icol) * pivinv + + do ll = 1, n + if (ll /= icol) then + dum = a(ll, icol) + a(ll, icol) = ZERO + do l = 1, n + a(ll, l) = a(ll, l) - a(icol, l) * dum + enddo + b(ll) = b(ll) - b(icol) * dum + endif + enddo + + enddo + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/mtdagmt.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/mtdagmt.F90 new file mode 100644 index 0000000000000000000000000000000000000000..44e1323a953c59ed3eeebdb6865c41f1e64b4c67 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/mtdagmt.F90 @@ -0,0 +1,40 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! mtdagmt.F90 - -> see m_tilde.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine mtdagmt(out, in, para, conf) + + use typedef_hmc + use module_p_interface + use module_vol + implicit none + + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(in) :: conf + + SPINCOL_FIELD :: out, in + P_SPINCOL_FIELD, save :: tmp + + TIMING_START(timing_bin_mtdagmt) + + ALLOCATE_SC_FIELD(tmp) + + call mtil(tmp, in, para, conf) + call mtil_dag(out, tmp, para, conf) + + TIMING_STOP(timing_bin_mtdagmt) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-EXPLAINED.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-EXPLAINED.var new file mode 100644 index 0000000000000000000000000000000000000000..cec45a8248829ba1fee2fbf339b0144c80da4fdd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-EXPLAINED.var @@ -0,0 +1,61 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +# +#------------------------------------------------------------------------------- +# +# Makefile-EXPLAINED.var +# +#------------------------------------------------------------------------------- + +SHELL = /bin/ksh + +MODULES_FLAG = -I$(MODULES_DIR) # how to find "modules" + # MODULES_DIR is set in every Makefile + +FPP = cpp -C -P # Fortran preprocessor / ANSI C preprocessor +F90 = f90 # Fortran90 compiler +CC = cc # ANSI C compiler +AR = ar # ar command +RANLIB = echo # "ranlib" if necessary + +MYFLAGS = -DTIMING # or to switch time measurement off + # must always be set for benchmarking + -DD3_BUFFER_VOL=24*24*12*12 + # -sample for lattice='24 24 24 48',processes='1 1 2 4' + # - largest possible size of a local lattice boundary + # - determines the size of *static* arrays + # - only needed in libd3.a + # - should be set to 1 when using a different LIBD. + -D_OPENMP # has to be explicitly defined for OpenMP + +FFLAGS = -O3 # Fortran90 compiler flags + $(MODULES_FLAG) +CFLAGS = -O3 # C compiles flags +ARFLAGS = rv # ar flags + +LDFLAGS = # loader flags (the loader is: ${F90}) +SYSLIBS = # system libraries (e.g. for BLAS) + +FAST_MAKE = gmake -j 8 # parallel make + +CKSUM_O = cksum.o # do not change +RANDOM_O = ran.o ranf.o # or "ran.o" on Crays +UUU_O = uuu_f90.o # or "uuu_fwd.o uuu_bwd.o uuu_bwd_m.o" if C is + # faster than Fortran90 (was a small effect on T3E) + +LIBD = libd.a # Multiplication with Wilson hopping term: + # libd.a: Cray T3E version (MPI or shmem) + # libd2.a: Hitachi SR8000 version (MPI) + # libd3.a: Hitachi SR8000 version (MPI+OpenMP) + # libd21.a: version for high scalability + +LIBCOMM = lib_mpi.a # or "lib_single_pe.a" or "lib_shmempi.a" + +LIBCLOVER = libclover.a # do not change + +#=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-altix.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-altix.var new file mode 100644 index 0000000000000000000000000000000000000000..baf01efce305d994685f8ed67ce8a0909ed49181 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-altix.var @@ -0,0 +1,108 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 2005, Hinnerk Stueben, Zuse-Institut Berlin +# +#------------------------------------------------------------------------------- +# +# Makefile-altix.var - settings on SGI-Altix +# +#------------------------------------------------------------------------------- + +timing = 1 +mpi = +omp = 1 +shmem = 1 +shmempi = 1 +debug = +libd = 2 +d3_buffer_vol = 32*32*16*16 + +#------------------------------------------------------------------------------- + +SHELL = /bin/ksh + +FPP = mpif90 -g -E +FPP2 = icc -E -C -P +F90 = mpif90 +CC = mpicc +AR = ar +RANLIB = echo + +MODULES_FLAG = -I$(MODULES_DIR) + +MYFLAGS = -DINTEL -DALTIX +FFLAGS_STD= $(MODULES_FLAG) +CFLAGS_STD= -DNamesToLower_ +ARFLAGS = rv + +LDFLAGS = -Vaxlib +SYSLIBS = + +FAST_MAKE = gmake -j 8 + +CKSUM_O = cksum.o +RANDOM_O = ran.o ranf.o +UUU_O = uuu_f90.o + +LIBD = +LIBCOMM = lib_single_pe.a +LIBCLOVER = libclover.a + +#------------------------------------------------------------------------------- + +ifdef timing + MYFLAGS += -DTIMING +endif + +ifdef mpi + LIBCOMM = lib_mpi.a +endif + +ifdef omp + F90 += -openmp + MYFLAGS += -D_OPENMP +endif + +ifdef shmem + LDFLAGS += -lsma + LIBCOMM = lib_shmem.a +endif + +ifdef shmempi + LDFLAGS += -lsma + LIBCOMM = lib_shmempi.a +endif + +ifdef debug + FFLAGS = -g -O0 $(FFLAGS_STD) + CFLAGS = -g -O0 $(CFLAGS_STD) +else + FFLAGS = -O2 $(FFLAGS_STD) + CFLAGS = -O2 $(CFLAGS_STD) +endif + +ifeq ($(libd),1) + LIBD = libd.a + MYFLAGS += -DD3_BUFFER_VOL=1 +endif + +ifeq ($(libd),2) + LIBD = libd2.a + MYFLAGS += -DD3_BUFFER_VOL=1 +endif + +ifeq ($(libd),21) + LIBD = libd21.a + MYFLAGS += -DD3_BUFFER_VOL=1 +endif + +ifeq ($(libd),3) + LIBD = libd3.a + MYFLAGS += -DD3_BUFFER_VOL='$(d3_buffer_vol)' +endif + +#=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-bgl.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-bgl.var new file mode 100644 index 0000000000000000000000000000000000000000..5841de7a5d1c09511eba640cae719b0e93016b2d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-bgl.var @@ -0,0 +1,105 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin +# +#------------------------------------------------------------------------------- +# +# Makefile-bgl.var - settings on BlueGene/L +# +#------------------------------------------------------------------------------- + +timing = 1 +debug = +bits64 = +libd = 2 +d3_buffer_vol = 32*32*16*16 + +#------------------------------------------------------------------------------- + +SHELL = /bin/ksh + +FPP = /opt/ibmcmp/vac/7.0/bin/blrts_xlc -E -C -P +F90 = /opt/ibmcmp/xlf/9.1/bin/blrts_xlf90 -qsuffix=f=f90 +CC = /opt/ibmcmp/vac/7.0/bin/blrts_xlc +AR = ar +RANLIB = echo + +BGLSYS = /bgl/BlueLight/ppcfloor/bglsys + +MODULES_FLAG = -I$(MODULES_DIR) + +MYFLAGS = -DIBM +FFLAGS_STD= $(MODULES_FLAG) -I$(BGLSYS)/include +CFLAGS_STD= -DLongLong -DNamesToLower +ARFLAGS = rv + +LDFLAGS = -L$(BGLSYS)/lib +SYSLIBS = -lmpich.rts -lfmpich.rts -lmsglayer.rts -lrts.rts -ldevices.rts + +FAST_MAKE = gmake -j 8 + +CKSUM_O = cksum.o +RANDOM_O = ran.o ranf.o +UUU_O = uuu_f90.o + +LIBD = +LIBCOMM = lib_mpi.a +LIBCLOVER = libclover.a + +#------------------------------------------------------------------------------- + +ifdef timing + MYFLAGS += -DTIMING +endif + +ifdef mpi + LIBCOMM = lib_mpi.a +endif + +ifdef omp + F90 += -qsmp=omp + MYFLAGS += -D_OPENMP +endif + +ifdef debug + FFLAGS = -g -qfullpath $(FFLAGS_STD) + CFLAGS = -g -qfullpath $(CFLAGS_STD) +else + FFLAGS = -O3 -qhot $(FFLAGS_STD) + CFLAGS = -O2 $(CFLAGS_STD) +endif + +ifdef bits64 + F90 += -q64 + CFLAGS += -q64 + ARFLAGS += -X64 +else + LDFLAGS += +endif + + +ifeq ($(libd),1) + LIBD = libd1.a + MYFLAGS += -DD3_BUFFER_VOL=1 +endif + +ifeq ($(libd),2) + LIBD = libd2.a + MYFLAGS += -DD3_BUFFER_VOL=1 +endif + +ifeq ($(libd),21) + LIBD = libd21.a + MYFLAGS += -DD3_BUFFER_VOL=1 +endif + +ifeq ($(libd),3) + LIBD = libd3.a + MYFLAGS += -DD3_BUFFER_VOL='$(d3_buffer_vol)' +endif + +#=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-cray.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-cray.var new file mode 100644 index 0000000000000000000000000000000000000000..22aacaf584928dcf5fc43178b4da26caee0a25fe --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-cray.var @@ -0,0 +1,45 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +# +#------------------------------------------------------------------------------- +# +# Makefile-cray.var - settings on Cray T3E +# +#------------------------------------------------------------------------------- + +SHELL = /bin/ksh + +MODULES_FLAG = -p $(MODULES_DIR) + +FPP = cpp -C -P +F90 = f90 +CC = cc +AR = ar +RANLIB = echo + +# "-M 801": suppress "unknown directive" messages (from OpenMP) + +MYFLAGS = -DTIMING -DD3_BUFFER_VOL=1 +FFLAGS = $(MODULES_FLAG) -g -M 801 +CFLAGS = -O3 +ARFLAGS = rv + +LDFLAGS = +SYSLIBS = + +FAST_MAKE = NPROC=4 make + +CKSUM_O = cksum.o +RANDOM_O = ran.o +UUU_O = uuu_fwd.o uuu_bwd.o uuu_bwd_m.o + +LIBD = libd.a +LIBCOMM = lib_mpi.a +LIBCLOVER = libclover.a + +#=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hitachi-omp.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hitachi-omp.var new file mode 100644 index 0000000000000000000000000000000000000000..975c7e235ef3a96047908424f22d4b3f91b8af06 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hitachi-omp.var @@ -0,0 +1,44 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +# +#------------------------------------------------------------------------------- +# +# Makefile-hitachi-omp.var - settings on Hitachi SR8000 +# (fastest version: MPI +OpenMP) +# +#------------------------------------------------------------------------------- + +SHELL = /bin/ksh + +MODULES_FLAG = -I$(MODULES_DIR) + +FPP = cpp -C -P -D_OPENMP +F90 = mpif90 +CC = cc +AR = ar +RANLIB = echo + +MYFLAGS = -DTIMING -DD3_BUFFER_VOL=24*24*12*12 +FFLAGS = $(MODULES_FLAG) -Oss -pvdiag -par=2 -pardiag=1 -omp -procnum=8 -nosave -contarea=2 +CFLAGS = -DLongLong +ARFLAGS = rv + +LDFLAGS = +BTLB -omp -rdma +SYSLIBS = /usr/local/lib/liblrz.a -lf90c -lpl + +FAST_MAKE = JOBTYPE=SS prun -p IAPAR gmake -j 8 + +CKSUM_O = cksum.o +RANDOM_O = ran.o ranf.o +UUU_O = uuu_f90.o + +LIBD = libd3.a +LIBCOMM = lib_mpi.a +LIBCLOVER = libclover.a + +#=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hitachi.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hitachi.var new file mode 100644 index 0000000000000000000000000000000000000000..24a06ad305c32af72d9827b01d70d41356ac1dfa --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hitachi.var @@ -0,0 +1,43 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +# +#------------------------------------------------------------------------------- +# +# Makefile-hitachi.var - settings on Hitachi SR8000 (pure MPI version) +# +#------------------------------------------------------------------------------- + +SHELL = /bin/ksh + +MODULES_FLAG = -I$(MODULES_DIR) + +FPP = cpp -C -P +F90 = mpif90 +CC = cc +AR = ar +RANLIB = echo + +MYFLAGS = -DTIMING -DD3_BUFFER_VOL=1 +FFLAGS = $(MODULES_FLAG) -opt=ss -par=0 -contarea=2 +CFLAGS = -DLongLong +ARFLAGS = rv + +LDFLAGS = +BTLB +SYSLIBS = /usr/local/lib/liblrz.a -lf90c -lpl + +FAST_MAKE = JOBTYPE=SS prun -p IAPAR gmake -j 8 + +CKSUM_O = cksum.o +RANDOM_O = ran.o ranf.o +UUU_O = uuu_f90.o + +LIBD = libd2.a +LIBCOMM = lib_mpi.a +LIBCLOVER = libclover.a + +#=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hp.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hp.var new file mode 100644 index 0000000000000000000000000000000000000000..a2050cfe4fdd0559ec513f6f1de864b193fbf9ca --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hp.var @@ -0,0 +1,45 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 2002-2003, Hinnerk Stueben, Zuse-Institut Berlin +# +#------------------------------------------------------------------------------- +# +# Makefile-hp.var - settings for HP-UX Fortran Compiler +# +#------------------------------------------------------------------------------- + +SHELL = /bin/ksh + +MODULES_FLAG = -I$(MODULES_DIR) + +FPP = /opt/langtools/lbin/cpp.ansi -P +F90 = mpif90 ####+Oopenmp +CC = cc +AR = ar +RANLIB = ranlib + +MYFLAGS = -DTIMING -DD3_BUFFER_VOL=1 #####-D_OPENMP +FFLAGS = $(MODULES_FLAG) \ + +r8 +DD64 +DSnative +O3 +Ocache_pad_common \ + +Olibcalls +Onolimit +Ofltacc=relaxed +FPD +CFLAGS = -DLongLong -DNamesToLower_ +DD64 +DSnative +ARFLAGS = rv + +LDFLAGS = +O3 +DD64 +DSnative +U77 +SYSLIBS = -L/opt/mlib/lib/hpux64 -lveclib + +FAST_MAKE = gmake -j 1 + +CKSUM_O = cksum.o +RANDOM_O = ran.o ranf.o +UUU_O = uuu_f90.o + +LIBD = libd.a +LIBCOMM = lib_mpi.a +LIBCLOVER = libclover.a + +#=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-ibm.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-ibm.var new file mode 100644 index 0000000000000000000000000000000000000000..8710c65d4cb392d964a86ebe422571a7c0d4adde --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-ibm.var @@ -0,0 +1,116 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 2002-2006, Hinnerk Stueben, Zuse-Institut Berlin +# +#------------------------------------------------------------------------------- +# +# Makefile-ibm.var - settings on IBM +# +#------------------------------------------------------------------------------- + +timing = 1 +mpi = 1 +omp = +debug = +bits64 = 1 +libd = 2 +d3_buffer_vol = 24*24*12*12 + +#------------------------------------------------------------------------------- + +SHELL = /bin/ksh + +FPP = /usr/ccs/lib/cpp -C -P +F90 = xlf90_r -qsuffix=f=f90 +CC = cc_r +AR = ar +RANLIB = echo + +# suppressed messages: +# +# 1500-036 (I) The NOSTRICT option (default at OPT(3)) has the potential +# to alter the semantics of a program +# +# 1516-092 (E) If a statement function expression contains a reference +# to a function or a function dummy procedure, the +# reference must not require an explicit interface or be +# a transformational intrinsic + +MODULES_FLAG = -I$(MODULES_DIR) + +MYFLAGS = -DIBM +FFLAGS_STD= $(MODULES_FLAG) -qsuppress=1500-036:1516-092 #:1501-510:1516-092:1514-008 +CFLAGS_STD= -DLongLong -DNamesToLower +ARFLAGS = -r -v + +LDFLAGS = +SYSLIBS = + +FAST_MAKE = gmake -j 8 + +CKSUM_O = cksum.o +RANDOM_O = ran.o ranf.o +UUU_O = uuu_f90.o + +LIBD = +LIBCOMM = lib_single_pe.a +LIBCLOVER = libclover.a + +#------------------------------------------------------------------------------- + +ifdef timing + MYFLAGS += -DTIMING +endif + +ifdef mpi + F90 = mpxlf90_r -qsuffix=f=f90 + LIBCOMM = lib_mpi.a +endif + +ifdef omp + F90 += -qsmp=omp + MYFLAGS += -D_OPENMP +endif + +ifdef debug + FFLAGS = -g -qfullpath $(FFLAGS_STD) + CFLAGS = -g -qfullpath $(CFLAGS_STD) +else + FFLAGS = -O3 $(FFLAGS_STD) + CFLAGS = -O2 $(CFLAGS_STD) +endif + +ifdef bits64 + F90 += -q64 + CFLAGS += -q64 + ARFLAGS += -X64 +else + LDFLAGS += -bmaxdata:2000000000 -bmaxstack:250000000 +endif + + +ifeq ($(libd),1) + LIBD = libd1.a + MYFLAGS += -DD3_BUFFER_VOL=1 +endif + +ifeq ($(libd),2) + LIBD = libd2.a + MYFLAGS += -DD3_BUFFER_VOL=1 +endif + +ifeq ($(libd),21) + LIBD = libd21.a + MYFLAGS += -DD3_BUFFER_VOL=1 +endif + +ifeq ($(libd),3) + LIBD = libd3.a + MYFLAGS += -DD3_BUFFER_VOL='$(d3_buffer_vol)' +endif + +#=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-intel.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-intel.var new file mode 100644 index 0000000000000000000000000000000000000000..e1d0443529a8d277e2acb4b5bade295e4e6c73cb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-intel.var @@ -0,0 +1,43 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 2002-2003, Hinnerk Stueben, Zuse-Institut Berlin +# +#------------------------------------------------------------------------------- +# +# Makefile-intel.var - settings for Intel Fortran Compiler +# +#------------------------------------------------------------------------------- + +SHELL = /bin/ksh + +MODULES_FLAG = -cl,bqcd.pcl + +FPP = cpp -C -P +F90 = ifc +CC = gcc +AR = ar +RANLIB = echo + +MYFLAGS = -DINTEL -DTIMING -DD3_BUFFER_VOL=1 +FFLAGS = $(MODULES_FLAG) +CFLAGS = -DLongLong -DNamesToLower_ +ARFLAGS = rv + +LDFLAGS = -Vaxlib +SYSLIBS = + +FAST_MAKE = gmake + +CKSUM_O = cksum.o +RANDOM_O = ran.o ranf.o +UUU_O = uuu_f90.o + +LIBD = libd.a +LIBCOMM = lib_single_pe.a +LIBCLOVER = libclover.a + +#=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-nec.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-nec.var new file mode 100644 index 0000000000000000000000000000000000000000..cfba0e88b8f2a10987e3f8e1ee7ac447d35fe20f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-nec.var @@ -0,0 +1,44 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +# +#------------------------------------------------------------------------------- +# +# Makefile-nec.var - settings on NEC SX-8 +# +#------------------------------------------------------------------------------- + +SHELL = /bin/ksh + +MODULES_FLAG = I$(MODULES_DIR) + +FPP = cpp -C -P +F90 = sxmpif90 +CC = sxmpic++ +AR = sxar +RANLIB = echo + +MYFLAGS = -DTIMING -DD3_BUFFER_VOL=24*24*12*12 + +FFLAGS = -$(MODULES_FLAG) +CFLAGS = -DNamesToLower_ -DLongLong +ARFLAGS = rv + +LDFLAGS = +SYSLIBS = + +FAST_MAKE = make + +CKSUM_O = cksum.o +RANDOM_O = ran.o ranf.o +UUU_O = uuu_f90.o + +LIBD = libd2.a +LIBCOMM = lib_mpi.a +LIBCLOVER = libclover.a + +#=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-sun.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-sun.var new file mode 100644 index 0000000000000000000000000000000000000000..82f8a6f9029f08895c6f89a5efbd031f63d3eb4b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-sun.var @@ -0,0 +1,44 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 1998-2001, Hinnerk Stueben, Zuse-Institut Berlin +# +#------------------------------------------------------------------------------- +# +# Makefile-sun.var - settings on Sun +# +#------------------------------------------------------------------------------- + +SHELL = /bin/ksh + +MODULES_FLAG = -M$(MODULES_DIR) + +# use GNU C-preprocessor (well hidden on our Sun): + +FPP = /sw/sun4_56/egcs-1.1.2-2/lib/gcc-lib/sparc-sun-solaris2.6/egcs-2.91.66/cpp -C -P +F90 = /opt/SUNWspro/bin/f90 +CC = gcc +AR = ar +RANLIB = echo + +MYFLAGS = -DTIMING -DD3_BUFFER_VOL=1 +FFLAGS = -O3 $(MODULES_FLAG) +CFLAGS = -O3 -DNamesToLower_ -DLongLong +ARFLAGS = rv + +LDFLAGS = +SYSLIBS = + +FAST_MAKE = make + +RANDOM_O = ran.o ranf.o +UUU_O = uuu_f90.o + +LIBD = libd.a +LIBCOMM = lib_single_pe.a +LIBCLOVER = libclover.a + +#=============================================================================== \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-EXPLAINED.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-EXPLAINED.F90 new file mode 100644 index 0000000000000000000000000000000000000000..5b2ef70328aceb5b96ad1bd051601aa94c361981 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-EXPLAINED.F90 @@ -0,0 +1,47 @@ +!=============================================================================== +! +! service.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine abbruch() ! exit with error status and shutdown parallel application + ! can be mpi_abort(1) when using MPI + call errexit() +end + +!------------------------------------------------------------------------------- +SECONDS function sekunden() ! returns CPU-seconds + ! SECONDS has to be defined in "defs.h" + sekunden = tsecnd() +end + +!------------------------------------------------------------------------------- +! Arguments from the command line (the following works on many machines): +!------------------------------------------------------------------------------- +subroutine pxfgetarg(iarg, arg, larg, status) ! iarg = 0 ==> command name + ! iarg > 0 ==> argument + implicit none + integer :: iarg, larg, status + character(len = *) :: arg + character(len(arg) + 1) :: a + + call getarg(iarg, a) + + larg = len_trim(a) + arg = a + status = 0 +end + +!------------------------------------------------------------------------------- +integer function ipxfargc() ! ipxfargc = 0 ==> no arguments + ipxfargc = iargc() ! ipxfargc > 0 ==> number of arguments +end + +!------------------------------------------------------------------------------- +logical function is_big_endian() ! .false. on e.g. Intel + is_big_endian = .true. +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-altix.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-altix.F90 new file mode 100644 index 0000000000000000000000000000000000000000..cd3e165e433b54bbc27bec6d2e2862eb7f1a73d9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-altix.F90 @@ -0,0 +1,71 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! service-altix.F90 - calls to service routines on SGI-Altix +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine abbruch() + + integer(4) status + status = 1 + call exit(status) + + !!include 'mpif.h' + !!integer errorcode, ierror + !!call mpi_abort(MPI_COMM_WORLD, errorcode, ierror) +end + +!------------------------------------------------------------------------------- +function rechner() ! returns hostname + + character(len = 20) rechner + character(len = 32) r + + call hostnm(r) + rechner = r +end + +!------------------------------------------------------------------------------- +SECONDS function sekunden() + + include 'mpif.h' + + sekunden = mpi_wtime() +end + +!------------------------------------------------------------------------------- +subroutine pxfgetarg(iarg, arg, larg, status) + + implicit none + integer :: iarg, larg, status + character(len = *) :: arg + character(100) :: a + + call getarg(iarg, a) + + larg = len_trim(a) + arg = a + status = 0 +end + +!------------------------------------------------------------------------------- +integer function ipxfargc() + ipxfargc = iargc() +end + +!------------------------------------------------------------------------------- +logical function is_big_endian() + is_big_endian = .false. +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-bgl.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-bgl.F90 new file mode 100644 index 0000000000000000000000000000000000000000..38e13dbe956dd405104094bee16885626577ca99 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-bgl.F90 @@ -0,0 +1,72 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! service-bgl.F90 - calls to service routines on BlueGene/L +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine abbruch() + + implicit none + include 'mpif.h' + + integer ierror + + call mpi_abort(MPI_COMM_WORLD, 1, ierror) +end + +!------------------------------------------------------------------------------- +function rechner() ! returns hostname + + character(len = 20) rechner + + rechner = "jubl" +end + +!------------------------------------------------------------------------------- +SECONDS function sekunden() + + integer(8), external :: rts_get_timebase + real(8), parameter :: speed = ONE / 700000000.0_8 + + sekunden = rts_get_timebase() * speed +end + +!------------------------------------------------------------------------------- +subroutine pxfgetarg(iarg, arg, larg, status) + + implicit none + integer :: iarg, larg, status + integer(4) :: i + character(len = *) :: arg + character(len(arg) + 1) :: a + + i = iarg + call getarg(i, a) + + larg = len_trim(a) + arg = a + status = 0 +end + +!------------------------------------------------------------------------------- +integer function ipxfargc() + integer(4) :: iargc + ipxfargc = iargc() +end + +!------------------------------------------------------------------------------- +logical function is_big_endian() + is_big_endian = .true. +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-cray.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-cray.F90 new file mode 100644 index 0000000000000000000000000000000000000000..f7981db72b83e9a6be52318313ac9aca84379ac9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-cray.F90 @@ -0,0 +1,70 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! service-ibm.F90 - calls to service routines on IBM +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine abbruch() + + integer(4) status + status = 1 + call exit(status) +end + +!------------------------------------------------------------------------------- +function rechner() ! returns hostname + + character(len = 20) rechner + character(len = 32) r + + call hostnm(r) + rechner = r +end + +!------------------------------------------------------------------------------- +SECONDS function sekunden() + + real(8) rtc + + sekunden = rtc() !!!mclock() * 0.01 +end + +!------------------------------------------------------------------------------- +subroutine pxfgetarg(iarg, arg, larg, status) + + implicit none + integer :: iarg, larg, status + integer(4) :: i + character(len = *) :: arg + character(len(arg) + 1) :: a + + i = iarg + call getarg(i, a) + + larg = len_trim(a) + arg = a + status = 0 +end + +!------------------------------------------------------------------------------- +integer function ipxfargc() + integer(4) :: iargc + ipxfargc = iargc() +end + +!------------------------------------------------------------------------------- +logical function is_big_endian() + is_big_endian = .true. +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hitachi-omp.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hitachi-omp.F90 new file mode 100644 index 0000000000000000000000000000000000000000..55704528a43a2ce05db1ba16ea2c2f0f43da0520 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hitachi-omp.F90 @@ -0,0 +1,68 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! service-hitachi.F90 - calls to service routines on HITACHI +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine abbruch() + + call fexit(1) +end + +!------------------------------------------------------------------------------- +function rechner() ! returns hostname + + character(len = 20) rechner + + call hostnm(rechner) +end + +!------------------------------------------------------------------------------- +SECONDS function sekunden() + + implicit none + +!!real(8) d +!!call xclock(d, 5) +!!sekunden = d + + real(8) dwalltime + sekunden = dwalltime() +end + +!------------------------------------------------------------------------------- +subroutine pxfgetarg(iarg, arg, larg, status) + + implicit none + integer :: iarg, larg, status + character(len = *) :: arg + character(len(arg) + 1) :: a + + call getarg(iarg + 1, a) + + larg = len_trim(a) + arg = a + status = 0 +end + +!------------------------------------------------------------------------------- +integer function ipxfargc() + ipxfargc = iargc() - 1 +end + +!------------------------------------------------------------------------------- +logical function is_big_endian() + is_big_endian = .true. +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hitachi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hitachi.F90 new file mode 100644 index 0000000000000000000000000000000000000000..55704528a43a2ce05db1ba16ea2c2f0f43da0520 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hitachi.F90 @@ -0,0 +1,68 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! service-hitachi.F90 - calls to service routines on HITACHI +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine abbruch() + + call fexit(1) +end + +!------------------------------------------------------------------------------- +function rechner() ! returns hostname + + character(len = 20) rechner + + call hostnm(rechner) +end + +!------------------------------------------------------------------------------- +SECONDS function sekunden() + + implicit none + +!!real(8) d +!!call xclock(d, 5) +!!sekunden = d + + real(8) dwalltime + sekunden = dwalltime() +end + +!------------------------------------------------------------------------------- +subroutine pxfgetarg(iarg, arg, larg, status) + + implicit none + integer :: iarg, larg, status + character(len = *) :: arg + character(len(arg) + 1) :: a + + call getarg(iarg + 1, a) + + larg = len_trim(a) + arg = a + status = 0 +end + +!------------------------------------------------------------------------------- +integer function ipxfargc() + ipxfargc = iargc() - 1 +end + +!------------------------------------------------------------------------------- +logical function is_big_endian() + is_big_endian = .true. +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hp.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hp.F90 new file mode 100644 index 0000000000000000000000000000000000000000..af019b707dab0d8d3ee8ac1c6ee8a2d6482b007c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hp.F90 @@ -0,0 +1,68 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! service-hp.F90 - calls to service routine with HP-UX Fortran Compiler +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine abbruch() + + integer(4) status + status = 1 + call exit(status) +end + +!------------------------------------------------------------------------------- +function rechner() ! returns hostname + + character(len = 20) rechner + character(len = 32) r + + call hostnm(r) + rechner = r +end + +!------------------------------------------------------------------------------- +SECONDS function sekunden() + + real, external :: walltime + + sekunden = walltime(0.0) + +end + +!------------------------------------------------------------------------------- +subroutine pxfgetarg(iarg, arg, larg, status) + + implicit none + integer :: iarg, larg, status + character(len = *) :: arg + character(100) :: a + + call getarg(iarg, a) + + larg = len_trim(a) + arg = a + status = 0 +end + +!------------------------------------------------------------------------------- +integer function ipxfargc() + ipxfargc = iargc() +end + +!------------------------------------------------------------------------------- +logical function is_big_endian() + is_big_endian = .false. +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-ibm.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-ibm.F90 new file mode 100644 index 0000000000000000000000000000000000000000..288e22355b448f65111abaada245ca0aadd7fc10 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-ibm.F90 @@ -0,0 +1,70 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! service-ibm.F90 - calls to service routines on IBM +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine abbruch() + + integer(4) status + status = 1 + call exit_(status) +end + +!------------------------------------------------------------------------------- +function rechner() ! returns hostname + + character(len = 20) rechner + character(len = 32) r + + call hostnm_(r) + rechner = r +end + +!------------------------------------------------------------------------------- +SECONDS function sekunden() + + real(8) rtc + + sekunden = rtc() !!!mclock() * 0.01 +end + +!------------------------------------------------------------------------------- +subroutine pxfgetarg(iarg, arg, larg, status) + + implicit none + integer :: iarg, larg, status + integer(4) :: i + character(len = *) :: arg + character(len(arg) + 1) :: a + + i = iarg + call getarg(i, a) + + larg = len_trim(a) + arg = a + status = 0 +end + +!------------------------------------------------------------------------------- +integer function ipxfargc() + integer(4) :: iargc + ipxfargc = iargc() +end + +!------------------------------------------------------------------------------- +logical function is_big_endian() + is_big_endian = .true. +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-intel.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-intel.F90 new file mode 100644 index 0000000000000000000000000000000000000000..f60c35f249022f6d631797114418584d0247066a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-intel.F90 @@ -0,0 +1,65 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! service-intel.F90 - calls to service routine with Intel Fortran Compiler +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine abbruch() + + integer(4) status + status = 1 + call exit(status) +end + +!------------------------------------------------------------------------------- +function rechner() ! returns hostname + + character(len = 20) rechner + character(len = 32) r + + call hostnm(r) + rechner = r +end + +!------------------------------------------------------------------------------- +SECONDS function sekunden() + + call cpu_time(sekunden) +end + +!------------------------------------------------------------------------------- +subroutine pxfgetarg(iarg, arg, larg, status) + + implicit none + integer :: iarg, larg, status + character(len = *) :: arg + character(100) :: a + + call getarg(iarg, a) + + larg = len_trim(a) + arg = a + status = 0 +end + +!------------------------------------------------------------------------------- +integer function ipxfargc() + ipxfargc = iargc() +end + +!------------------------------------------------------------------------------- +logical function is_big_endian() + is_big_endian = .false. +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-nec.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-nec.F90 new file mode 100644 index 0000000000000000000000000000000000000000..db903a6691780fd249e828b6a12e7b7e6b176e68 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-nec.F90 @@ -0,0 +1,65 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! service-nec.F90 - calls service routines on NEC SX-8 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine abbruch() + + integer(4) status + status = 1 + call exit(status) +end + +!------------------------------------------------------------------------------- +SECONDS function sekunden() + + call cpu_time(sekunden) +end + +!------------------------------------------------------------------------------- +function rechner() ! returns hostname + + character(len = 20) rechner + character(len = 32) r + + call hostnm(r) + rechner = r +end + +!------------------------------------------------------------------------------- +subroutine pxfgetarg(iarg, arg, larg, status) + + implicit none + integer :: iarg, larg, status + character(len = *) :: arg + character(len(arg) + 1) :: a + + call getarg(iarg, a) + + larg = len_trim(a) + arg = a + status = 0 +end + +!------------------------------------------------------------------------------- +integer function ipxfargc() + ipxfargc = iargc() +end + +!------------------------------------------------------------------------------- +logical function is_big_endian() + is_big_endian = .true. +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-sun.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-sun.F90 new file mode 100644 index 0000000000000000000000000000000000000000..017d07bdd74355e3d0d1e386342c92a7f936c44c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-sun.F90 @@ -0,0 +1,64 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! service-sun.F90 - calls to service routines on SUN +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine abbruch() + + call exit(1) +end + +!------------------------------------------------------------------------------- +function rechner() ! returns hostname + + character(len = 20) rechner + + i = hostnm(rechner) +end + +!------------------------------------------------------------------------------- +SECONDS function sekunden() + + real time(2) + + call etime(time) + sekunden = time(1) +end + +!------------------------------------------------------------------------------- +subroutine pxfgetarg(iarg, arg, larg, status) + + implicit none + integer :: iarg, larg, status + character(len = *) :: arg + character(len(arg) + 1) :: a + + call getarg(iarg, a) + + larg = len_trim(a) + arg = a + status = 0 +end + +!------------------------------------------------------------------------------- +integer function ipxfargc() + ipxfargc = iargc() +end + +!------------------------------------------------------------------------------- +logical function is_big_endian() + is_big_endian = .false. +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/polyakov_loop.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/polyakov_loop.F90 new file mode 100644 index 0000000000000000000000000000000000000000..015be3504883e1cfd546382078bed8385143badf --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/polyakov_loop.F90 @@ -0,0 +1,98 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2000-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! polyakov_loop.F90 - in gamma_4-direction, requires (NPE(gamma_4) == 1) +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine polyakov_loop(conf, traj, i_ensemble1, i_ensemble2) + + use typedef_hmc + use module_decomp + use module_function_decl + use module_vol + implicit none + + integer, intent(in) :: traj, i_ensemble1, i_ensemble2 + type(hmc_conf), intent(in) :: conf + + COMPLEX :: pl + REAL :: re_pl, im_pl + integer :: x, y, z, t, i, eo, j(DIM) + integer :: nx, ny, nz, nt, dir4, npe4 + integer, external :: ieo, e_o, std_xyzt2i + SU3 :: u + + SU3, parameter :: su3_one = reshape( & + (/ ONE,ZERO,ZERO, & + ZERO,ONE,ZERO, & + ZERO,ZERO,ONE /), & + (/ NCOL, NCOL /)) + + character(len=*), parameter :: key_pl = "%pl" + integer, save :: count = 0 + + + count = count + 1 + + dir4 = decomp%direction(4) + npe4 = decomp%act%npe(dir4) + + if (npe4 /= 1) then + call die("polyakov_loop(): gamma_4 direction must not be decomposed") + endif + + nx = decomp%std%N(1) + ny = decomp%std%N(2) + nz = decomp%std%N(3) + nt = decomp%std%N(4) + + pl = 0 + !$omp parallel do reduction(+: pl) private(x, y, z, t, i, j, eo, u) + do x = 0, nx - 1 + do y = 0, ny - 1 + do z = 0, nz - 1 + + u = su3_one + do t = 0, nt - 1 + j = (/x, y, z, t/) + + i = std_xyzt2i(j) + eo = e_o(j) + + call u_update2(u, conf%u(1, 1, i, eo, 4)) + enddo + + pl = pl + u(1,1) + u(2,2) + u(3,3) + enddo + enddo + enddo + + call global_sum_vec(SIZE_COMPLEX, pl) + + pl = pl / (THREE * decomp%std%L(1) * decomp%std%L(2) * decomp%std%L(3)) + + re_pl = Re(pl) + im_pl = Im(pl) + + if (my_pe() == 0) then + if (count == 1) write(UREC, 400) & + "T", key_pl, "traj", "e", "f", "Re(Polyakov_Loop)", "Im(Polyakov_Loop)" + + write(UREC, 410) key_pl, traj, i_ensemble1, i_ensemble2, re_pl, im_pl + endif + + +400 format (1x, 2a, a6, 2a3, 2a20) +410 format (1x, a4, i6, 2i3, 2g20.10) + +end diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/ran.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/ran.F90 new file mode 100644 index 0000000000000000000000000000000000000000..85b241f3f38cfde13470a63c84a7d5ad4d365683 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/ran.F90 @@ -0,0 +1,207 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! ran.F90 - random number related routines +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine ran_gauss_volh(m, ran, var, eo) + +! intitializes x with Gaussian random numbers in a manner that makes +! results independent of the lattice decomposition + + use module_decomp + use module_function_decl + use module_vol + implicit none + + integer, intent(in) :: m + COMPLEX, intent(out) :: ran(m, volh) + REAL, intent(in) :: var + integer, intent(in) :: eo + + integer, dimension (DIM) :: i_pe, block + integer :: i, ii, x, y, z, t, j + integer :: nxh, nx, ny, nz, nt, lx, ly, lz, npe(DIM) + REAL :: twovar, phi, r + + twovar = TWO * var + + i_pe = decomp%std%i_pe + npe = decomp%std%NPE + + nxh = decomp%std%NH(1) + + nx = decomp%std%N(1) + ny = decomp%std%N(2) + nz = decomp%std%N(3) + nt = decomp%std%N(4) + + lx = decomp%std%L(1) + ly = decomp%std%L(2) + lz = decomp%std%L(3) + + ! block() contains the number of random numbers to be skipped. + ! In the calculation of block() a factor 1/2 from "even/odd volume" + ! cancels with 2 from "two random numbers per site". + + block(1) = m * nx + block(2) = m * lx * ny + block(3) = m * lx * ly * nz + block(4) = m * lx * ly * lz * nt + + i = 0 + call ranskip(block(4) * i_pe(4)) + do t = 0, nt - 1 + call ranskip(block(3) * i_pe(3)) + do z = 0, nz - 1 + call ranskip(block(2) * i_pe(2)) + do y = 0, ny - 1 + call ranskip(block(1) * i_pe(1)) + do x = 0, nxh - 1 + i = i + 1 + ii = decomp%act%i(i, eo) + do j = 1, m + r = sqrt(-twovar * log(ranf())) + phi = TWOPI * ranf() + ran(j, ii) = cmplx(r * cos(phi), r * sin(phi)) + enddo + enddo + call ranskip(block(1) * (npe(1) - 1 - i_pe(1))) + enddo + call ranskip(block(2) * (npe(2) - 1 - i_pe(2))) + enddo + call ranskip(block(3) * (npe(3) - 1 - i_pe(3))) + enddo + call ranskip(block(4) * (npe(4) - 1 - i_pe(4))) + + +CONTAINS + + subroutine ranskip(n) + integer :: n + SEED :: seed, n_skip + + n_skip = n + call ranget(seed) + call ranset(seed, n_skip) + end subroutine ranskip + +end + +!------------------------------------------------------------------------------- +subroutine rancheck() ! checks if seed is equal on all PEs + + use module_function_decl + implicit none + SEED seed + + call ranget(seed) + call seed_compare(seed) + +end + +!------------------------------------------------------------------------------- +subroutine write_ran() ! save state of random number generator + + use module_function_decl + implicit none + SEED :: seed + FILENAME, external :: ran_file + + call rancheck() + call ranget(seed) + if (my_pe() == 0) then + open(URAN, file = ran_file(), action = "write") + write(URAN, *) seed + close(URAN) + endif + +end + +!------------------------------------------------------------------------------- +subroutine random_sequence(r, n) ! random permutation of [1..n] + + use module_function_decl + implicit none + integer, intent(in) :: n + integer, intent(out) :: r(n) + integer :: ran, seq(n), i, j, len_seq + integer :: ceiling + + len_seq = n + + do i = 1, len_seq + seq(i) = i + enddo + + do i = 1, n - 1 + ran = ceiling(len_seq * ranf()) + if (ran <= 0 .or. ran > len_seq) stop "random_sequence(): ran out of range" + r(i) = seq(ran) + do j = ran, len_seq - 1 + seq(j) = seq(j + 1) + enddo + len_seq = len_seq - 1 + enddo + r(n) = seq(1) + +end + +!------------------------------------------------------------------------------- +subroutine get_a_random_seed(seed) ! (try to) generate a random seed + + use module_function_decl + implicit none + SEED :: seed + integer :: count, rate, rate_10sec + integer :: pe, ierror + + if (my_pe() == 0) then + call system_clock(count = count, count_rate = rate) + + if (rate <= 0) call die("get_a_random_seed(): failed") + + rate_10sec = rate * 10 + seed = mod(count, rate_10sec) + endif + + call seed_broadcast(seed) +end + +!------------------------------------------------------------------------------- +subroutine init_ran(para, flags) ! initilize random number generator + + use typedef_para + use typedef_flags + implicit none + + type(type_para) :: para + type(type_flags) :: flags + FILENAME, external :: ran_file + SEED :: seed, null + + if (flags%continuation_job) then + open(URAN, file = ran_file(), action = "read", status = "old") + read(URAN, *) para%seed + close(URAN) + else + seed = para%seed + if (seed < 0) call get_a_random_seed(para%seed) + endif + + null = 0 + call ranset(para%seed, null) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/ranf.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/ranf.F90 new file mode 100644 index 0000000000000000000000000000000000000000..3b3a8216d3642bd9c558a910298b587d15f377bd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/ranf.F90 @@ -0,0 +1,112 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! ranf.F90 - Fortran90 implementation of the Cray random number generator ranf() +! +!------------------------------------------------------------------------------- +module module_ranf + + integer, parameter :: ikind = 8 + integer, parameter :: jkind = 8 + integer, parameter :: rkind = 8 + + integer(ikind), parameter :: mH = 2651554_ikind + integer(ikind), parameter :: mL = 15184245_ikind + integer(ikind), parameter :: mask24 = 16777215_ikind + integer(ikind), parameter :: mask48 = 281474976710655_ikind + integer(ikind), parameter :: default_seed = 48131768981101_ikind + integer(ikind), save :: seed = default_seed + +! integer(ikind), save :: mH +! integer(ikind), save :: mL +! integer(ikind), save :: mask24 +! integer(ikind), save :: mask48 +! integer(ikind), save :: default_seed +! integer(ikind), save :: seed +! +! data mH /o"12072642"/ +! data mL /o"71730565"/ +! data mask24 /o"77777777"/ +! data mask48 /o"7777777777777777"/ +! data default_seed /o"1274321477413155"/ +! data seed /o"1274321477413155"/ + +end + +!------------------------------------------------------------------------------- +function ranf() + + use module_ranf + implicit none + real(rkind) :: ranf + integer(ikind) :: seedH, seedL + + ! seed = mod(m * seed, 48): + + seedH = iand(mask24, ishft(seed, -24_jkind)) + seedL = iand(mask24, seed) + seed = iand(mask48, seedL * mL + ishft(seedL * mH + seedH * mL, 24_jkind)) + + ! normalize result: + + ranf = real(seed, kind = rkind) + ranf = set_exponent(fraction(ranf), exponent(ranf) - 48_jkind) + +end + +!------------------------------------------------------------------------------- +subroutine ranget(seed_out) + + use module_ranf + implicit none + integer(ikind), intent(out) :: seed_out + + seed_out = seed + +end + +!------------------------------------------------------------------------------- +subroutine ranset(seed_in, n_skip) + + use module_ranf + implicit none + integer(ikind), intent(in) :: seed_in, n_skip + integer(ikind) :: n, mm, mmH, mmL, seedH, seedL + + if (seed_in == 0_ikind) then + seed = default_seed + else + seed = iand(mask48, ibset(seed_in, 0_jkind)) + endif + + ! skip "n_skip" seeds [i.e. calculate seed = mod(m**n_skip * seed, 48)]: + + n = iand(mask48, n_skip) + mmH = mH + mmL = mL + + do while (n > 0_ikind) + + if (btest(n, 0_ikind)) then ! seed = seed * mm + seedH = iand(mask24, ishft(seed, -24_jkind)) + seedL = iand(mask24, seed) + seed = iand(mask48, seedL*mmL + ishft(seedL*mmH + seedH*mmL, 24_jkind)) + endif + + mm = iand(mask48, mmL * mmL + ishft(mmH * mmL, 25_jkind)) ! mm = mm * mm + mmH = iand(mask24, ishft(mm, -24_jkind)) + mmL = iand(mask24, mm) + + n = ishft(n, -1_jkind) + enddo + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/ranf_test.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/ranf_test.F90 new file mode 100644 index 0000000000000000000000000000000000000000..e87ba91aa5faa7fd9580607010cb9f0574de3b11 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/ranf_test.F90 @@ -0,0 +1,82 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! ranf_test - test of "ranf.F90" +! +!------------------------------------------------------------------------------- + +program ranf_test + + implicit none + integer(8), parameter :: null = 0 + integer(8) :: i, seed + real(8) :: x, ranf + + write(6,400) "default seed:" + + do i = 1, 10 + call ranget(seed) + x = ranf() + write(6,410) i, seed, x + enddo + + write(6,400) "seed = 4711:" + + call ranset(4711_8, null) + + do i = 1, 10 + call ranget(seed) + x = ranf() + write(6,410) i, seed, x + enddo + + write(6,400) "seed varies, no skip:" + + do i = -10, 20 + call ranset(i, null) + call ranget(seed) + x = ranf() + write(6,410) i, seed, x + enddo + + write(6,400) "default seed, skip varies:" + + do i = -10, 20 + call ranset(null, i) + call ranget(seed) + x = ranf() + write(6,410) i, seed, x + enddo + + write(6,400) "large seeds, no skip:" + + do i = 0, 47 + call ranset(2_8**i, null) + call ranget(seed) + x = ranf() + write(6,410) i, seed, x + enddo + + write(6,400) "default seeds, big skips:" + + do i = 0, 47 + call ranset(null, 2_8**i) + call ranget(seed) + x = ranf() + write(6,410) i, seed, x + enddo + + +400 format (//1x,a//) +410 format (i6,i24,f24.16) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/ranf_test.reference b/qcd/part_cpu/applications/QCD/src/kernel_A/ranf_test.reference new file mode 100644 index 0000000000000000000000000000000000000000..cd9731dca6ac5f2b10293d5a2749ab1d7b2b12ab --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/ranf_test.reference @@ -0,0 +1,208 @@ + + + default seed: + + + 1 48131768981101 0.5801136485795872 + 2 163287475723473 0.9505127349807658 + 3 267545549941893 0.7863714253306036 + 4 221343878630857 0.2976202640037293 + 5 83772656879069 0.4536999002984921 + 6 127705168870145 0.0062619416061871 + 7 1762579867765 0.2757364263838760 + 8 77612904194681 0.3056509438704786 + 9 86033092307533 0.6891007107498730 + 10 193964606509617 0.3826622386562981 + + + seed = 4711: + + + 1 4711 0.5499394951912784 + 2 154794206601235 0.1941472565046602 + 3 54647594503087 0.6685086396955562 + 4 188168453789179 0.7925054533613682 + 5 223070454027959 0.8427178850184980 + 6 237203997059235 0.0340008174584874 + 7 9570379302271 0.6038999714237825 + 8 169982730392075 0.9377837430456815 + 9 263962657233415 0.8757572329452152 + 10 246503746747443 0.5980755473195210 + + + seed varies, no skip: + + + -10 281474976710647 0.5775951060376308 + -9 281474976710647 0.5775951060376308 + -8 281474976710649 0.8936850824737128 + -7 281474976710649 0.8936850824737128 + -6 281474976710651 0.2097750589097949 + -5 281474976710651 0.2097750589097949 + -4 281474976710653 0.5258650353458769 + -3 281474976710653 0.5258650353458769 + -2 281474976710655 0.8419550117819590 + -1 281474976710655 0.8419550117819590 + 0 48131768981101 0.5801136485795872 + 1 1 0.1580449882180410 + 2 3 0.4741349646541231 + 3 3 0.4741349646541231 + 4 5 0.7902249410902051 + 5 5 0.7902249410902051 + 6 7 0.1063149175262872 + 7 7 0.1063149175262872 + 8 9 0.4224048939623692 + 9 9 0.4224048939623692 + 10 11 0.7384948703984513 + 11 11 0.7384948703984513 + 12 13 0.0545848468345334 + 13 13 0.0545848468345334 + 14 15 0.3706748232706154 + 15 15 0.3706748232706154 + 16 17 0.6867647997066975 + 17 17 0.6867647997066975 + 18 19 0.0028547761427795 + 19 19 0.0028547761427795 + 20 21 0.3189447525788616 + + + default seed, skip varies: + + + -10 65394301920949 0.4024069778585933 + -9 113267494720953 0.0411126961321777 + -8 11572195186317 0.3413527205224831 + -7 96082249059185 0.7768798964931598 + -6 218672250772389 0.7323797526691216 + -5 206146573825897 0.4161318802907665 + -4 117130711313405 0.8580567658715417 + -3 241521508190113 0.9085015119759028 + -2 255720441925013 0.9242771334193982 + -1 260160884603417 0.1709983940440232 + 0 48131768981101 0.5801136485795872 + 1 163287475723473 0.9505127349807658 + 2 267545549941893 0.7863714253306036 + 3 221343878630857 0.2976202640037293 + 4 83772656879069 0.4536999002984921 + 5 127705168870145 0.0062619416061871 + 6 1762579867765 0.2757364263838760 + 7 77612904194681 0.3056509438704786 + 8 86033092307533 0.6891007107498730 + 9 193964606509617 0.3826622386562981 + 10 107709844713829 0.1329027054963809 + 11 37408785934377 0.8318579032090732 + 12 234147183932349 0.5829797958307417 + 13 164094224454241 0.0986253383374169 + 14 27760564811605 0.2765484551335682 + 15 77841469968089 0.6204460277969481 + 16 174640031224365 0.0835029668338088 + 17 23503995644817 0.9903771205956851 + 18 278766376954437 0.9793469434430655 + 19 275661658097289 0.6938844384181841 + 20 195311106143645 0.9344770142467986 + + + large seeds, no skip: + + + 0 1 0.1580449882180410 + 1 3 0.4741349646541231 + 2 5 0.7902249410902051 + 3 9 0.4224048939623692 + 4 17 0.6867647997066975 + 5 33 0.2154846111953539 + 6 65 0.2729242341726668 + 7 129 0.3878034801272925 + 8 257 0.6175619720365439 + 9 513 0.0770789558550469 + 10 1025 0.9961129234920527 + 11 2049 0.8341808587660644 + 12 4097 0.5103167293140878 + 13 8193 0.8625884704101345 + 14 16385 0.5671319526022280 + 15 32769 0.9762189169864151 + 16 65537 0.7943928457547891 + 17 131073 0.4307407032915371 + 18 262145 0.7034364183650332 + 19 524289 0.2488278485120254 + 20 1048577 0.3396107088060099 + 21 2097153 0.5211764293939787 + 22 4194305 0.8843078705699163 + 23 8388609 0.6105707529217916 + 24 16777217 0.0630965176255422 + 25 33554433 0.9681480470330435 + 26 67108865 0.7782511058480459 + 27 134217729 0.3984572234780508 + 28 268435457 0.6388694587380606 + 29 536870913 0.1196939292580801 + 30 1073741825 0.0813428702981192 + 31 2147483649 0.0046407523781973 + 32 4294967297 0.8512365165383535 + 33 8589934593 0.5444280448586660 + 34 17179869185 0.9308111014992910 + 35 34359738369 0.7035772147805410 + 36 68719476737 0.2491094413430410 + 37 137438953473 0.3401738944680410 + 38 274877906945 0.5223028007180410 + 39 549755813889 0.8865606132180410 + 40 1099511627777 0.6150762382180410 + 41 2199023255553 0.0721074882180410 + 42 4398046511105 0.9861699882180410 + 43 8796093022209 0.8142949882180410 + 44 17592186044417 0.4705449882180410 + 45 35184372088833 0.7830449882180410 + 46 70368744177665 0.4080449882180410 + 47 140737488355329 0.6580449882180410 + + + default seeds, big skips: + + + 0 163287475723473 0.9505127349807658 + 1 267545549941893 0.7863714253306036 + 2 83772656879069 0.4536999002984921 + 3 86033092307533 0.6891007107498730 + 4 174640031224365 0.0835029668338088 + 5 164662163201517 0.9030257778452473 + 6 187456359032173 0.6504473584280426 + 7 247956895216749 0.1291985634769084 + 8 256822357742189 0.7782475093925463 + 9 15047453609581 0.7480693881996707 + 10 59815078594157 0.7411042261636034 + 11 36250749236845 0.3277774080605163 + 12 65623225914989 0.2811012722930322 + 13 17231226705517 0.8281668150128247 + 14 127417138878061 0.6480316574714529 + 15 67360884790893 0.5031963704648810 + 16 92173458085485 0.5752659087564247 + 17 158548977129069 0.1663654345582621 + 18 76826528323181 0.1364062830369370 + 19 181387589981805 0.2278551674942868 + 20 55158666827373 0.0162216864089864 + 21 150146494895725 0.0148297242383855 + 22 41054988277357 0.6995457998971837 + 23 33978207573613 0.8189779512147801 + 24 19824646166125 0.0578422538499730 + 25 272992500061805 0.5355708591203587 + 26 216378254431853 0.4910280696611302 + 27 103149763171949 0.4019424907426732 + 28 158167757362797 0.2237713329057591 + 29 268203745744493 0.8674290172319310 + 30 206800745797229 0.1547443858842747 + 31 83994745902701 0.7293751231889622 + 32 119857722824301 0.8786365977983372 + 33 191583676667501 0.1771595470170872 + 34 53560607643245 0.7742054454545872 + 35 58989446305389 0.9682972423295872 + 36 69847123629677 0.3564808360795872 + 37 91562478278253 0.1328480235795872 + 38 134993187575405 0.6855823985795872 + 39 221854606169709 0.7910511485795872 + 40 114102466647661 0.0019886485795872 + 41 180073164314221 0.4238636485795872 + 42 30539582936685 0.2676136485795872 + 43 12947396892269 0.9551136485795872 + 44 259238001514093 0.3301136485795872 + 45 188869257336429 0.0801136485795872 + 46 48131768981101 0.5801136485795872 + 47 48131768981101 0.5801136485795872 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/sc.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/sc.F90 new file mode 100644 index 0000000000000000000000000000000000000000..47f85b02af90ccee3a894027c591284f6499918e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/sc.F90 @@ -0,0 +1,278 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! sc.F90 - routines for the Spin-Colour field +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_sc_size + + integer :: sc_n_real ! number of real numbers of an sc-field + integer :: sc_n_complex ! number of complex numbers of an sc-field + +end + +!------------------------------------------------------------------------------- +subroutine init_module_sc_size() + + use module_sc_size + use module_vol + implicit none + + sc_n_complex = NDIRAC * NCOL * volh + sc_n_real = NDIRAC * NCOL * volh * SIZE_COMPLEX + +end + +!------------------------------------------------------------------------------- +subroutine sc_zero(out) + + use module_sc_size + implicit none + REAL, intent(out) :: out(*) + integer :: i + + TIMING_START(timing_bin_sc_zero) + + !$omp parallel do + do i = 1, sc_n_real + out(i) = ZERO + enddo + + TIMING_STOP(timing_bin_sc_zero) +end + +!------------------------------------------------------------------------------- +subroutine sc_copy(out, in) + + use module_sc_size + implicit none + REAL, intent(out) :: out(*) + REAL, intent(in) :: in(*) + integer :: i + + TIMING_START(timing_bin_sc_copy) + + !$omp parallel do + do i = 1, sc_n_real + out(i) = in(i) + enddo + + TIMING_STOP(timing_bin_sc_copy) +end + +!------------------------------------------------------------------------------- +subroutine sc_scale(inout, factor) + + use module_sc_size + implicit none + REAL, intent(inout) :: inout(*) + REAL, intent(in) :: factor + integer :: i + + TIMING_START(timing_bin_sc_scale) + + !$omp parallel do + do i = 1, sc_n_real + inout(i) = inout(i) * factor + enddo + + TIMING_STOP(timing_bin_sc_scale) +end + +!------------------------------------------------------------------------------- +subroutine sc_cax2(out, in1, a1, in2, a2) ! out = a1 * in1 + a2 * in2 + + use module_sc_size + implicit none + COMPLEX, intent(out) :: out(*) + COMPLEX, intent(in) :: in1(*), in2(*) + COMPLEX, intent(in) :: a1, a2 + integer :: i + + TIMING_START(timing_bin_sc_cax2) + + !$omp parallel do + do i = 1, sc_n_complex + out(i) = a1 * in1(i) + a2 * in2(i) + enddo + + TIMING_STOP(timing_bin_sc_cax2) +end + +!------------------------------------------------------------------------------- +subroutine sc_axpy(inout, in, a) ! inout = inout + a * in + + use module_sc_size + implicit none + REAL, intent(inout) :: inout(*) + REAL, intent(in) :: in(*) + REAL, intent(in) :: a + integer :: i + + TIMING_START(timing_bin_sc_axpy) + + !$omp parallel do + do i = 1, sc_n_real + inout(i) = inout(i) + a * in(i) + enddo + + TIMING_STOP(timing_bin_sc_axpy) +end + +!------------------------------------------------------------------------------- +subroutine sc_caxpy(inout, in, a) ! inout = inout + a * in + + use module_sc_size + implicit none + COMPLEX, intent(inout) :: inout(*) + COMPLEX, intent(in) :: in(*) + COMPLEX, intent(in) :: a + integer :: i + + TIMING_START(timing_bin_sc_caxpy) + + !$omp parallel do + do i = 1, sc_n_complex + inout(i) = inout(i) + a * in(i) + enddo + + TIMING_STOP(timing_bin_sc_caxpy) +end + +!------------------------------------------------------------------------------- +subroutine sc_caxpy2(inout, in1, a1, in2, a2) ! inout = inout + a1*in1 + a2*in2 + + use module_sc_size + implicit none + COMPLEX, intent(inout) :: inout(*) + COMPLEX, intent(in) :: in1(*), in2(*) + COMPLEX, intent(in) :: a1, a2 + integer :: i + + TIMING_START(timing_bin_sc_caxpy2) + + !$omp parallel do + do i = 1, sc_n_complex + inout(i) = inout(i) + a1 * in1(i) + a2 * in2(i) + enddo + + TIMING_STOP(timing_bin_sc_caxpy2) +end + +!------------------------------------------------------------------------------- +subroutine sc_xpby(inout, in, b) ! inout = b * inout + in + + use module_sc_size + implicit none + REAL, intent(inout) :: inout(*) + REAL, intent(in) :: in(*) + REAL, intent(in) :: b + integer :: i + + TIMING_START(timing_bin_sc_xpby) + + !$omp parallel do + do i = 1, sc_n_real + inout(i) = b * inout(i) + in(i) + enddo + + TIMING_STOP(timing_bin_sc_xpby) +end + +!------------------------------------------------------------------------------- +subroutine sc_axpby(inout, in, b, a) ! inout = b * inout + a * in + + use module_sc_size + implicit none + REAL, intent(inout) :: inout(*) + REAL, intent(in) :: in(*) + REAL, intent(in) :: b, a + integer :: i + + TIMING_START(timing_bin_sc_axpby) + + !$omp parallel do + do i = 1, sc_n_real + inout(i) = b * inout(i) + a * in(i) + enddo + + TIMING_STOP(timing_bin_sc_axpby) +end + +!------------------------------------------------------------------------------- +REAL function sc_norm2(in) ! Sum_i abs(in_i)**2 + + use module_sc_size + implicit none + REAL, intent(in) :: in(*) + REAL :: tmp + integer :: i + + TIMING_START(timing_bin_sc_norm2) + + tmp = ZERO + !$omp parallel do reduction(+: tmp) + do i = 1, sc_n_real + tmp = tmp + in(i)**2 + enddo + + sc_norm2 = tmp + + TIMING_STOP(timing_bin_sc_norm2) +end + +!------------------------------------------------------------------------------- +REAL function sc_dot(x, y) ! Sum_i [Re(x_i) * Re(y_i) + Im(x_i) * Im(y_i)] + + use module_sc_size + implicit none + REAL, intent(in) :: x(*), y(*) + REAL :: tmp + integer :: i + + TIMING_START(timing_bin_sc_dot) + + tmp = ZERO + !$omp parallel do reduction(+: tmp) + do i = 1, sc_n_real + tmp = tmp + x(i) * y(i) + enddo + + sc_dot = tmp + + TIMING_STOP(timing_bin_sc_dot) +end + +!------------------------------------------------------------------------------- +COMPLEX function sc_cdotc(x, y) ! Sum_i conjg(x_i) * y_i + + use module_sc_size + implicit none + COMPLEX, intent(in) :: x(*), y(*) + COMPLEX :: tmp + integer :: i + + TIMING_START(timing_bin_sc_cdotc) + + tmp = ZERO + !$omp parallel do reduction(+: tmp) + do i = 1, sc_n_complex + tmp = tmp + conjg(x(i)) * y(i) + enddo + + sc_cdotc = tmp + + TIMING_STOP(timing_bin_sc_cdotc) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/service.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/service.F90 new file mode 100644 index 0000000000000000000000000000000000000000..f7981db72b83e9a6be52318313ac9aca84379ac9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/service.F90 @@ -0,0 +1,70 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! service-ibm.F90 - calls to service routines on IBM +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine abbruch() + + integer(4) status + status = 1 + call exit(status) +end + +!------------------------------------------------------------------------------- +function rechner() ! returns hostname + + character(len = 20) rechner + character(len = 32) r + + call hostnm(r) + rechner = r +end + +!------------------------------------------------------------------------------- +SECONDS function sekunden() + + real(8) rtc + + sekunden = rtc() !!!mclock() * 0.01 +end + +!------------------------------------------------------------------------------- +subroutine pxfgetarg(iarg, arg, larg, status) + + implicit none + integer :: iarg, larg, status + integer(4) :: i + character(len = *) :: arg + character(len(arg) + 1) :: a + + i = iarg + call getarg(i, a) + + larg = len_trim(a) + arg = a + status = 0 +end + +!------------------------------------------------------------------------------- +integer function ipxfargc() + integer(4) :: iargc + ipxfargc = iargc() +end + +!------------------------------------------------------------------------------- +logical function is_big_endian() + is_big_endian = .true. +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/staple.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/staple.F90 new file mode 100644 index 0000000000000000000000000000000000000000..a06be5d95f528d6deda28562173fa1aa6f902bcc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/staple.F90 @@ -0,0 +1,67 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! staple.F90 - calculates sum of staples for one link +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine staple(uuu, u, i, e, mu) + + use module_nn + use module_vol + implicit none + + SU3, intent(out) :: uuu + GAUGE_FIELD, intent(in) :: u + integer, intent(in) :: i, e, mu + integer :: o, nu, j1, j2, j3, j4 + + o = EVEN + ODD - e + uuu = 0 + + do nu = 1, DIM + if (nu /= mu) then + + ! (j2,o) --<-- x nu + ! | | + ! v ^ ^ + ! | | | + ! (i,e) -->-- (j1,o) x--> mu + ! | | + ! ^ v + ! | | + ! (j3,o) --<-- (j4,e) + + + j1 = nn(i, e, mu, FWD) + j2 = nn(i, e, nu, FWD) + j3 = nn(i, e, nu, BWD) + j4 = nn(j3,o, mu, FWD) + + if (j4 /= nn(j1, o, nu, BWD)) call die('staple(): j4 inconsistent') + if (nn(j1, o, nu, FWD) /= nn(j2, o, mu, FWD)) & + call die('staple(): j12 inconsistent') + + call uuu_fwd(uuu, u(1, 1, j1, o, nu), & + u(1, 1, j2, o, mu), & + u(1, 1, i, e, nu)) + + call uuu_bwd(uuu, u(1, 1, j4, e, nu), & + u(1, 1, j3, o, mu), & + u(1, 1, j3, o, nu)) + + endif + enddo + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/su3.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/su3.F90 new file mode 100644 index 0000000000000000000000000000000000000000..ad5cc768abdbd2f60aecfa362596150e5a9df702 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/su3.F90 @@ -0,0 +1,477 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! su3.F90 - SU(3) routines +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine gen2u(u, h) ! u := exp(i lambda_j h_j) + +! adapted from: + +! Program qcdf90, module generator_algebra, version 4.0.0 + +! Copyright by Indranil Dasgupta, Andrea R. Levi, Vittorio Lubicz +! and Claudio Rebbi - Boston University - January 1996 +! This program may be freely copied and used as long as this notice +! is retained. + + implicit none + + GENERATOR, intent(in) :: h + SU3, intent(out) :: u + + REAL :: p, q, a, alpha, l1, l2, l3, l12, s, aux, c, d, cs1, cs2 + REAL :: a8, a12, a45, a67 + GENERATOR :: h2, hs, hk, hs2 + COMPLEX :: ck1, ck2, ck3, ck4 + SU3 :: ms, mk + integer :: i, j, k + + integer, parameter :: rkind = RKIND + COMPLEX, parameter :: iu = (ZERO, ONE) + REAL, parameter :: eps = 0.00000001_rkind + + REAL, parameter :: sqrt33 = SQRT3 / THREE + REAL, parameter :: twosqrt33 = TWO * sqrt33 + +! h2=.Sq.h, inlined: + + a8 = h(8)*SQRT33 + a12 = h(1)**2+h(2)**2 + a45 = h(4)**2+h(5)**2 + a67 = h(6)**2+h(7)**2 + h2(1) = 2*h(1)*a8+h(4)*h(6)+h(5)*h(7) + h2(2) = 2*h(2)*a8+h(5)*h(6)-h(4)*h(7) + h2(3) = 2*h(3)*a8+HALF*(a45-a67) + h2(4) = h(4)*(h(3)-a8)+h(1)*h(6)-h(2)*h(7) + h2(5) = h(5)*(h(3)-a8)+h(1)*h(7)+h(2)*h(6) + h2(6) = h(6)*(-h(3)-a8)+h(1)*h(4)+h(2)*h(5) + h2(7) = h(7)*(-h(3)-a8)+h(1)*h(5)-h(2)*h(4) + h2(8) = (h(3)**2-h(8)**2+a12-HALF*(a45+a67))*SQRT33 + +! q = .Tr.h, inlined: + + q = h(1)**2 + DO i = 2,8 + q = q+h(i)**2 + END DO + q = TWO*q + +! p = (h*h2)/THREE, inlined: + + p = h(1)*h2(1) + DO i = 2,8 + p = p+h(i)*h2(i) + END DO + p = TWO*p/THREE + + a = SQRT(TWO*q/THREE) + alpha = ACOS(FOUR*p/a**3)/THREE + IF(alpha <= PI/6) THEN + l1 = a*COS(alpha) + l2 = a*COS(alpha+2*PI/3) + ELSE + l2 = a*COS(alpha) + l1 = a*COS(alpha+2*PI/3) + ENDIF + l3 = -l1-l2 + + l12 = l1*l2 + s = -l1-2*l2 + + aux = (TWO*l3*l3+l12)*(l1-l2) + c = s*(l3*l3+TWO*l12)/aux + d = -THREE*s*l3/aux + +! hs = c*h+d*h2, and +! hk = h-hs, inlined: + + DO i = 1,8 + hs(i) = c*h(i)+d*h2(i) + hk(i) = h(i)-hs(i) + END DO + +! hs2 = .Sq.hs, inlined: + + a8 = hs(8)*SQRT33 + a12 = hs(1)**2+hs(2)**2 + a45 = hs(4)**2+hs(5)**2 + a67 = hs(6)**2+hs(7)**2 + hs2(1) = 2*hs(1)*a8+hs(4)*hs(6)+hs(5)*hs(7) + hs2(2) = 2*hs(2)*a8+hs(5)*hs(6)-hs(4)*hs(7) + hs2(3) = 2*hs(3)*a8+HALF*(a45-a67) + hs2(4) = hs(4)*(hs(3)-a8)+hs(1)*hs(6)-hs(2)*hs(7) + hs2(5) = hs(5)*(hs(3)-a8)+hs(1)*hs(7)+hs(2)*hs(6) + hs2(6) = hs(6)*(-hs(3)-a8)+hs(1)*hs(4)+hs(2)*hs(5) + hs2(7) = hs(7)*(-hs(3)-a8)+hs(1)*hs(5)-hs(2)*hs(4) + hs2(8) = (hs(3)**2-hs(8)**2+a12-HALF*(a45+a67))*SQRT33 + + IF(ABS(s) > eps) THEN + cs1 = SIN(s)/s + cs2 = (COS(s)-1)/s**2 + ELSE + cs1 = 1 + cs2 = -HALF + ENDIF + + ck1 = EXP(IU*l3) + ck2 = 1/ck1**2 + ck3 = (ck2+2*ck1)/3 + IF(ABS(l3) > eps) THEN + ck4 = (ck1-ck2)/(3*l3) + ELSE + ck4 = 3*IU + ENDIF + +! aux = .Tr.hs, inlined: + + aux = hs(1)**2 + DO i = 2,8 + aux = aux+hs(i)**2 + END DO + aux = TWO*aux + +! ms = UNIT+IU*cs1*(.Matrix.hs.)+cs2*(.Matrix.hs)*(.Matrix.hs), inlined: + + ms(1,1) = ONE+cs2*aux/THREE & + +CMPLX(cs2*(hs2(3)+SQRT33*hs2(8)),cs1*(hs(3)+SQRT33*hs(8)),RKIND) + ms(2,2) = ONE+cs2*aux/THREE & + +CMPLX(cs2*(-hs2(3)+SQRT33*hs2(8)),cs1*(-hs(3)+SQRT33*hs(8)),RKIND) + ms(3,3) = ONE+cs2*aux/THREE & + +CMPLX(-cs2*TWOSQRT33*hs2(8),-cs1*TWOSQRT33*hs(8),RKIND) + ms(1,2) = CMPLX(cs2*hs2(1)+cs1*hs(2),cs1*hs(1)-cs2*hs2(2),RKIND) + ms(2,1) = CMPLX(cs2*hs2(1)-cs1*hs(2),cs1*hs(1)+cs2*hs2(2),RKIND) + ms(1,3) = CMPLX(cs2*hs2(4)+cs1*hs(5),cs1*hs(4)-cs2*hs2(5),RKIND) + ms(3,1) = CMPLX(cs2*hs2(4)-cs1*hs(5),cs1*hs(4)+cs2*hs2(5),RKIND) + ms(2,3) = CMPLX(cs2*hs2(6)+cs1*hs(7),cs1*hs(6)-cs2*hs2(7),RKIND) + ms(3,2) = CMPLX(cs2*hs2(6)-cs1*hs(7),cs1*hs(6)+cs2*hs2(7),RKIND) + +! mk = ck3*UNIT+ck4*(.Matrix.hk), inlined: + + mk(1,1) = ck3+ck4*(hk(3)+SQRT33*hk(8)) + mk(2,2) = ck3+ck4*(-hk(3)+SQRT33*hk(8)) + mk(3,3) = ck3-ck4*TWOSQRT33*hk(8) + mk(1,2) = ck4*CMPLX(hk(1),-hk(2),RKIND) + mk(2,1) = ck4*CMPLX(hk(1),hk(2),RKIND) + mk(1,3) = ck4*CMPLX(hk(4),-hk(5),RKIND) + mk(3,1) = ck4*CMPLX(hk(4),hk(5),RKIND) + mk(2,3) = ck4*CMPLX(hk(6),-hk(7),RKIND) + mk(3,2) = ck4*CMPLX(hk(6),hk(7),RKIND) + +! u = ms*mk, inlined: + + DO i = 1,3 + DO j = 1,3 + u(i,j) = ms(i,1)*mk(1,j) + DO k = 2,3 + u(i,j) = u(i,j)+ms(i,k)*mk(k,j) + END DO + END DO + END DO + +END + +!------------------------------------------------------------------------------- +subroutine im_tr_j(p, u, s) ! p(j) := p(j) + s * Im Tr(lambda_j U) + + implicit none + + GENERATOR :: p + SU3 :: u + REAL :: s + + p(1) = p(1) + s * (Im(u(1, 2)) + Im(u(2, 1))) + p(2) = p(2) + s * (Re(u(1, 2)) - Re(u(2, 1))) + p(3) = p(3) + s * (Im(u(1, 1)) - Im(u(2, 2))) + p(4) = p(4) + s * (Im(u(1, 3)) + Im(u(3, 1))) + p(5) = p(5) + s * (Re(u(1, 3)) - Re(u(3, 1))) + p(6) = p(6) + s * (Im(u(2, 3)) + Im(u(3, 2))) + p(7) = p(7) + s * (Re(u(2, 3)) - Re(u(3, 2))) + p(8) = p(8) + s * (Im(u(1, 1)) + Im(u(2, 2)) - TWO * Im(u(3, 3))) / SQRT3 + +end + +!------------------------------------------------------------------------------- +subroutine re_tr_j(p, u, s) ! p(j) := p(j) + s * Re Tr(lambda_j U) + + implicit none + + GENERATOR :: p + SU3 :: u + REAL :: s + + p(1) = p(1) + s * (Re(u(1, 2)) + Re(u(2, 1))) + p(2) = p(2) + s * (Im(u(2, 1)) - Im(u(1, 2))) + p(3) = p(3) + s * (Re(u(1, 1)) - Re(u(2, 2))) + p(4) = p(4) + s * (Re(u(1, 3)) + Re(u(3, 1))) + p(5) = p(5) + s * (Im(u(3, 1)) - Im(u(1, 3))) + p(6) = p(6) + s * (Re(u(2, 3)) + Re(u(3, 2))) + p(7) = p(7) + s * (Im(u(3, 2)) - Im(u(2, 3))) + p(8) = p(8) + s * (Re(u(1, 1)) + Re(u(2, 2)) - TWO * Re(u(3, 3))) / SQRT3 + +end + +!------------------------------------------------------------------------------- +subroutine su3_check(u) ! checks if "u" is in SU(3) + + implicit none + SU3 :: u, v + SU3, parameter :: su3_one = reshape( & + (/ ONE,ZERO,ZERO, & + ZERO,ONE,ZERO, & + ZERO,ZERO,ONE /), & + (/ NCOL, NCOL /)) + REAL, parameter :: eps = 1e-13 + REAL :: dev + integer :: i, j + + call uud(v, u, u) + dev = ZERO + do i = 1, NCOL + do j = 1, NCOL + dev = dev + abs(Re(v(i, j)) - Re(su3_one(i, j))) & + + abs(Im(v(i, j)) - Im(su3_one(i, j))) + enddo + enddo + + if (dev > eps) call die('su3_check(): dev > eps') + + call su3_check_det(u) + +end + +!------------------------------------------------------------------------------- +subroutine su3_check_det(u) + + implicit none + COMPLEX :: det + SU3 :: u + REAL, parameter :: eps = 1e-13 + + det = u(1,1) * u(2,2) * u(3,3) & + + u(1,2) * u(2,3) * u(3,1) & + + u(1,3) * u(2,1) * u(3,2) & + - u(1,1) * u(2,3) * u(3,2) & + - u(1,2) * u(2,1) * u(3,3) & + - u(1,3) * u(2,2) * u(3,1) + + if (abs(Re(det) - ONE) > eps) call die("check_su3_det(): Re(det) /= 1") + if (abs(Im(det)) > eps) call die("check_su3_det(): Im(det) /= 0") + +end + +!------------------------------------------------------------------------------- +subroutine u_add(u, v) ! u := u + v + + implicit none + + SU3 :: u, v + integer i, j + + do j = 1, NCOL + do i = 1, NCOL + u(i, j) = u(i, j) + v(i, j) + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine u_complete(u) ! calculate 3rd column form the first two + + implicit none + SU3 :: u + + u(1,3) = conjg(u(2,1) * u(3,2) - u(3,1) * u(2,2)) + u(2,3) = conjg(u(3,1) * u(1,2) - u(1,1) * u(3,2)) + u(3,3) = conjg(u(1,1) * u(2,2) - u(2,1) * u(1,2)) + +end + +!------------------------------------------------------------------------------- +subroutine u_normalize(u) + +! from qcdsf_t3e program: +! +! u_normalize() takes a complex matrix and produces a true su3 matrix from +! the upper 6 entries (because of FORTRAN the first two rows -> DIFFERS +! FROM APE-PROGRAM!!) +! +! ( * * . ) +! ( * * . ) +! ( * * . ) (right 3 completely ignored) +! +! Normalization done by Gramm-Schmitt + + implicit none + SU3 :: u + COMPLEX :: f + REAL :: len + integer :: i + + len = real(u(1,1))**2 + aimag(u(1,1))**2 + & ! length u_1 + real(u(2,1))**2 + aimag(u(2,1))**2 + & + real(u(3,1))**2 + aimag(u(3,1))**2 + len = sqrt(len) + + do i = 1, NCOL + u(i,1) = u(i,1) / len ! normalize u_1 + enddo + + f = u(1,2) * conjg(u(1,1)) + & + u(2,2) * conjg(u(2,1)) + & + u(3,2) * conjg(u(3,1)) + + do i = 1, NCOL + u(i,2) = u(i,2) - f * u(i,1) ! orthogonalize + enddo + + len = real(u(1,2))**2 + aimag(u(1,2))**2 + & ! length u_2 + real(u(2,2))**2 + aimag(u(2,2))**2 + & + real(u(3,2))**2 + aimag(u(3,2))**2 + len = sqrt(len) + + do i = 1, NCOL + u(i,2) = u(i,2) / len ! normalize u_2 + enddo + + call u_complete(u) + +! u(1,3) = conjg(u(2,1) * u(3,2) - u(3,1) * u(2,2)) ! calculate u_3 +! u(2,3) = conjg(u(3,1) * u(1,2) - u(1,1) * u(3,2)) ! = u_1 x u_2 +! u(3,3) = conjg(u(1,1) * u(2,2) - u(2,1) * u(1,2)) + +end + +!------------------------------------------------------------------------------- +subroutine u_trans(u) ! u := transpose(u) + + implicit none + + SU3, intent(inout) :: u + COMPLEX :: tmp + integer :: i, j + + do j = 1, NCOL + do i = j + 1, NCOL + tmp = u(i, j) + u(i, j) = u(j, i) + u(j, i) = tmp + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine u_update(u, v) ! u = v * u + + implicit none + + SU3, intent(in) :: v + SU3, intent(inout) :: u + SU3 :: w + + w = u + call uu(u, v, w) + +end + +!------------------------------------------------------------------------------- +subroutine u_update2(u, v) ! u = u * v + + implicit none + + SU3, intent(in) :: v + SU3, intent(inout) :: u + SU3 :: w + + w = u + call uu(u, w, v) + +end + +!------------------------------------------------------------------------------- +subroutine uu(r, a, b) ! r = a * b + + implicit none + + SU3 :: r, a, b + integer :: i, j + + do i = 1, NCOL + do j = 1, NCOL + r(i, j) = a(i, 1) * b(1, j) & + + a(i, 2) * b(2, j) & + + a(i, 3) * b(3, j) + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine uud(r, a, b) ! U U^dagger: r = a * b+ + + implicit none + + SU3 :: r, a, b + integer :: i, j + + do i = 1, NCOL + do j = 1, NCOL + r(i, j) = a(i, 1) * conjg(b(j, 1)) & + + a(i, 2) * conjg(b(j, 2)) & + + a(i, 3) * conjg(b(j, 3)) + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine udu(r, a, b) ! U^dagger U: r = a+ * b + + implicit none + + SU3 :: r, a, b + integer :: i, j + + do i = 1, NCOL + do j = 1, NCOL + r(i, j) = conjg(a(1, i)) * b(1, j) & + + conjg(a(2, i)) * b(2, j) & + + conjg(a(3, i)) * b(3, j) + enddo + enddo + +end + +!------------------------------------------------------------------------------- +REAL function Re_Tr_uu(u, v) ! returns Re(Tr(u * v)) + + implicit none + SU3, intent(in) :: u, v + REAL :: p + integer :: c1, c2 + + p = 0 + do c2 = 1, NCOL + do c1 = 1, NCOL + p = p + Re(u(c2, c1)) * Re(v(c1, c2)) & + - Im(u(c2, c1)) * Im(v(c1, c2)) + enddo + enddo + + Re_Tr_uu = p + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/swap.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/swap.F90 new file mode 100644 index 0000000000000000000000000000000000000000..d4d924dc22f69551c01e5bd8c923a1d6df64e89f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/swap.F90 @@ -0,0 +1,90 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! swap.F90 - swap routines for various data types +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine swap_p_g_field(u, v) + + implicit none + P_GAUGE_FIELD :: u, v, tmp + + tmp => u + u => v + v => tmp + +end + +!------------------------------------------------------------------------------- +subroutine swap_p_sc_field(a, b) + + implicit none + P_SPINCOL_FIELD :: a, b, tmp + + tmp => a + a => b + b => tmp + +end + +!------------------------------------------------------------------------------- +subroutine swap_p_clover_field_a(x, y) + + use typedef_clover + implicit none + P_CLOVER_FIELD_A :: x, y, tmp + + tmp => x + x => y + y => tmp + +end + +!------------------------------------------------------------------------------- +subroutine swap_p_clover_field_b(x, y) + + use typedef_clover + implicit none + P_CLOVER_FIELD_B :: x, y, tmp + + tmp => x + x => y + y => tmp + +end + +!------------------------------------------------------------------------------- +subroutine swap_real(x, y) + + implicit none + REAL :: x, y, tmp + + tmp = x + x = y + y = tmp + +end + +!------------------------------------------------------------------------------- +subroutine swap_integer(x, y) + + implicit none + integer :: x, y, tmp + + tmp = x + x = y + y = tmp + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/test_echo.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/test_echo.F90 new file mode 100644 index 0000000000000000000000000000000000000000..4533c28887c30a2cf91018d0ae019770ae14e9ff --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/test_echo.F90 @@ -0,0 +1,9 @@ +program bqcd_echo + + character(2) :: arg + + do i = 1, ipxfargc() + call pxfgetarg(i, arg, length, istat) + write(6,*) i, ":", arg(1:length), ":" + enddo +end diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/timing.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/timing.F90 new file mode 100644 index 0000000000000000000000000000000000000000..7393ff71738c3b81f1153846dd19bdd8afa773c3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/timing.F90 @@ -0,0 +1,324 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! timing.F90 - measurements of execution times and performance +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_timing_meas + + implicit none + + type type_timing + SECONDS time + SECONDS total_time + integer n_call + integer fill_the_cache_line + end type type_timing + + integer, parameter :: n_timing = 55 + type(type_timing), save :: meas(n_timing) ! measurements +!dir$ cache_align meas + + data meas /n_timing * type_timing(0.0, 0.0, 0, 0)/ + +end + +!------------------------------------------------------------------------------- +module module_timing_write + + use module_timing_meas + implicit none + + + character(len = 16), save :: text(n_timing) ! region name + integer, save :: n_op(n_timing) ! # operations + + data text(timing_bin_d_xf) /"d_xf"/ + data text(timing_bin_d_xb) /"d_xb"/ + data text(timing_bin_d_yf) /"d_yf"/ + data text(timing_bin_d_yb) /"d_yb"/ + data text(timing_bin_d_zf) /"d_zf"/ + data text(timing_bin_d_zb) /"d_zb"/ + data text(timing_bin_d_t) /"d_t"/ + data text(timing_bin_d) /"D_TOTAL"/ + + data text(timing_bin_global_sum) /"global_sum"/ + data text(timing_bin_global_sum_vec) /"global_sum_vec"/ + data text(timing_bin_sc_zero) /"sc_zero"/ + data text(timing_bin_sc_copy) /"sc_copy"/ + data text(timing_bin_sc_scale) /"sc_scale"/ + data text(timing_bin_sc_axpy) /"sc_axpy"/ + data text(timing_bin_sc_caxpy) /"sc_caxpy"/ + data text(timing_bin_sc_caxpy2) /"sc_caxpy2"/ + data text(timing_bin_sc_cax2) /"sc_cax2"/ + data text(timing_bin_sc_xpby) /"sc_xpby"/ + data text(timing_bin_sc_axpby) /"sc_axpby"/ + data text(timing_bin_sc_norm2) /"sc_norm2"/ + data text(timing_bin_sc_dot) /"sc_dot"/ + data text(timing_bin_sc_cdotc) /"sc_cdotc"/ + + data text(timing_bin_plaq) /"plaquette"/ + data text(timing_bin_cooling) /"cooling"/ + data text(timing_bin_u_read) /"u_read"/ + data text(timing_bin_u_write) /"u_write"/ + + data text(timing_bin_total) /"TOTAL"/ + data text(timing_bin_hmc) /"HMC"/ + data text(timing_bin_cg) /"CG"/ + data text(timing_bin_mtdagmt) /"MTDAGMT"/ + + data text(timing_bin_dsf) /"dsf"/ + data text(timing_bin_dsg) /"dsg"/ + data text(timing_bin_hmc_u) /"hmc_u"/ + data text(timing_bin_hmc_init_p) /"hmc_init_p"/ + + data text(timing_bin_clover_init) /"clover_init"/ + data text(timing_bin_clover_mult_a) /"clover_mult_a"/ + data text(timing_bin_clover_mult_ao)/"clover_mult_ao"/ + data text(timing_bin_clover_mult_b) /"clover_mult_b"/ + data text(timing_bin_clover_dsd) /"clover_dsd"/ + data text(timing_bin_clover_dsf) /"clover_dsf"/ + + data text(timing_bin_hmc_init) /"hmc_init"/ + data text(timing_bin_hmc_momenta) /"hmc_momenta"/ + data text(timing_bin_hmc_init_phi) /"hmc_phi"/ + data text(timing_bin_hmc_h_old) /"hmc_h_old"/ + data text(timing_bin_hmc_backup) /"hmc_backup"/ + data text(timing_bin_hmc_half_step0)/"hmc_half_step0"/ + data text(timing_bin_hmc_half_step1)/"hmc_half_step1"/ + data text(timing_bin_hmc_xbound_g) /"hmc_xbound_g"/ + data text(timing_bin_hmc_steps) /"hmc_steps"/ + data text(timing_bin_hmc_h_new) /"hmc_h_new"/ + data text(timing_bin_hmc_rest) /"hmc_rest"/ + + data text(timing_bin_h_mult_a) /"h_mult_a"/ + data text(timing_bin_h_mult_b) /"h_mult_b"/ + data text(timing_bin_h_mult_c) /"h_mult_c"/ + + data text(timing_bin_sc2_projection)/"sc2_projection"/ + + integer, parameter :: op_add = 2 ! operations per complex add. + integer, parameter :: op_mult = 6 ! operations per complex mult. + + integer, parameter, private :: op_d_xyz = 18 * op_mult + 30 * op_add + integer, parameter, private :: op_d_t = 36 * op_mult + 24 * op_add + 24 + + integer, parameter :: op_d = 6 * op_d_xyz + op_d_t + + integer, parameter, private :: op_uuu = 162 * op_mult + integer, parameter, private :: op_re_tr = 36 + integer, parameter, private :: op_plaq = 2 * 6 * (op_uuu + op_re_tr) + + integer, parameter :: op_sc_r = NDIRAC * NCOL * SIZE_COMPLEX + integer, parameter :: op_sc_c = NDIRAC * NCOL + + integer, parameter :: op_blas_r1 = op_sc_r + integer, parameter :: op_blas_r2 = op_sc_r * 2 + integer, parameter :: op_blas_r3 = op_sc_r * 3 + integer, parameter :: op_blas_c1 = op_sc_c * op_mult + integer, parameter :: op_blas_c2 = op_sc_c * (op_mult + op_add) + integer, parameter :: op_blas_c3 = 2 * op_blas_c2 + + integer, parameter :: op_clov = 84 * op_mult + 60 * op_add + 24 + integer, parameter :: op_h_mult= NDIRAC * NCOL * 3 + + integer, parameter :: op_sc2_proj = 36 * op_add + + data n_op(timing_bin_d_xf) /op_d_xyz/ + data n_op(timing_bin_d_xb) /op_d_xyz/ + data n_op(timing_bin_d_yf) /op_d_xyz/ + data n_op(timing_bin_d_yb) /op_d_xyz/ + data n_op(timing_bin_d_zf) /op_d_xyz/ + data n_op(timing_bin_d_zb) /op_d_xyz/ + data n_op(timing_bin_d_t) /op_d_t/ + data n_op(timing_bin_d) /op_d/ + + data n_op(timing_bin_global_sum) /0/ + data n_op(timing_bin_global_sum_vec) /0/ + data n_op(timing_bin_sc_zero) /0/ + data n_op(timing_bin_sc_copy) /0/ + data n_op(timing_bin_sc_scale) /op_blas_r1/ + data n_op(timing_bin_sc_norm2) /op_blas_r2/ + data n_op(timing_bin_sc_dot) /op_blas_r2/ + data n_op(timing_bin_sc_axpy) /op_blas_r2/ + data n_op(timing_bin_sc_xpby) /op_blas_r2/ + data n_op(timing_bin_sc_axpby) /op_blas_r3/ + data n_op(timing_bin_sc_cdotc) /op_blas_c2/ + data n_op(timing_bin_sc_caxpy) /op_blas_c2/ + data n_op(timing_bin_sc_caxpy2) /op_blas_c3/ + data n_op(timing_bin_sc_cax2) /op_blas_c3/ + + data n_op(timing_bin_plaq) /op_plaq/ + data n_op(timing_bin_cooling) /0/ + data n_op(timing_bin_u_read) /0/ + data n_op(timing_bin_u_write) /0/ + + data n_op(timing_bin_total) /0/ + data n_op(timing_bin_hmc) /0/ + data n_op(timing_bin_cg) /0/ + data n_op(timing_bin_mtdagmt) /0/ + + data n_op(timing_bin_dsf) /0/ + data n_op(timing_bin_dsg) /0/ + data n_op(timing_bin_hmc_u) /0/ + + data n_op(timing_bin_clover_init) /0/ + data n_op(timing_bin_clover_mult_a) /op_clov/ + data n_op(timing_bin_clover_mult_ao)/op_clov/ + data n_op(timing_bin_clover_mult_b) /op_clov/ + data n_op(timing_bin_clover_dsd) /0/ + data n_op(timing_bin_clover_dsf) /0/ + + data n_op(timing_bin_hmc_init) /0/ + data n_op(timing_bin_hmc_momenta) /0/ + data n_op(timing_bin_hmc_init_phi) /0/ + data n_op(timing_bin_hmc_h_old) /0/ + data n_op(timing_bin_hmc_backup) /0/ + data n_op(timing_bin_hmc_half_step0)/0/ + data n_op(timing_bin_hmc_half_step1)/0/ + data n_op(timing_bin_hmc_xbound_g) /0/ + data n_op(timing_bin_hmc_steps) /0/ + data n_op(timing_bin_hmc_h_new) /0/ + data n_op(timing_bin_hmc_rest) /0/ + + data n_op(timing_bin_h_mult_a) /op_h_mult/ + data n_op(timing_bin_h_mult_b) /op_h_mult/ + data n_op(timing_bin_h_mult_c) /op_h_mult/ + data n_op(timing_bin_sc2_projection)/op_sc2_proj/ + +end + +!------------------------------------------------------------------------------- +subroutine timing_start(bin) + + use module_timing_meas + implicit none + integer bin + SECONDS sekunden + + meas(bin)%time = sekunden() +end + +!------------------------------------------------------------------------------- +subroutine timing_stop(bin) + + use module_timing_meas + implicit none + integer bin + SECONDS sekunden + + meas(bin)%total_time = meas(bin)%total_time + sekunden() - meas(bin)%time + meas(bin)%n_call = meas(bin)%n_call + 1 +end + +!------------------------------------------------------------------------------- +subroutine timing_write(unit) + + use module_timing_write + use module_cg + use module_function_decl + use module_switches + use module_thread + use module_vol + implicit none + + integer unit, i + integer ierror + integer op_mtdagmt + integer cg_calls, cg_iter + real mflops, mflops_mean, mflops_min, mflops_max + real total_gflops, time_mean + + + character(len = 8) :: a_mflops_mean, a_mflops_min, a_mflops_max, & + a_total_gflops, a_time_mean, a_n_call + + character(*), parameter :: ifmt = "(i8)", ffmt = "(f8.2)", & + tab_fmt ="(2(3(1x,a),2x),1x,a)" + + if (version_of_d() >= 2) then + n_op(timing_bin_d_xf) = n_op(timing_bin_d_xf) * 2 + n_op(timing_bin_d_yf) = n_op(timing_bin_d_yf) * 2 + n_op(timing_bin_d_zf) = n_op(timing_bin_d_zf) * 2 + endif + + op_mtdagmt = 2 * (2 * op_d + op_blas_r2) + if (switches%clover) op_mtdagmt = op_mtdagmt + 4 * op_clov + if (switches%h_ext) op_mtdagmt = op_mtdagmt + 4 * op_h_mult + + if (version_of_d() == 21 .or. version_of_d() == 22) then + n_op(timing_bin_d_xf) = n_op(timing_bin_d_xf) - 12 * op_add + n_op(timing_bin_d_yf) = n_op(timing_bin_d_yf) - 12 * op_add + n_op(timing_bin_d_zf) = n_op(timing_bin_d_zf) - 12 * op_add + endif + + cg_calls = meas(timing_bin_cg)%n_call + cg_iter = cg_iterations_total + + n_op(timing_bin_mtdagmt) = op_mtdagmt + n_op(timing_bin_cg) = cg_iter * (op_mtdagmt + 5 * op_blas_r2) & + + cg_calls * (op_mtdagmt + op_blas_r1) + n_op(timing_bin_cg) = nint(real(n_op(timing_bin_cg)) / real(cg_calls)) + + + call begin(unit, "Timing") + if (my_pe() == 0) then + write(unit,"(48x,a)") "Performance" + write(unit, tab_fmt) "region ", " #calls", " time", & + " mean", " min", " max", " Total" + write(unit, tab_fmt) " ", " ", " s", & + " Mflop/s", " Mflop/s", " Mflop/s", " Gflop/s" + write(unit, *) + endif + + do i = 1, n_timing + + write(a_n_call, ifmt) meas(i)%n_call + a_time_mean = " " + a_mflops_mean = " " + a_mflops_min = " " + a_mflops_max = " " + a_total_gflops = " " + + if (meas(i)%n_call /= 0) then ! must be true on all PEs !! + + time_mean = global_sum(real(meas(i)%total_time, kind=RKIND)) / num_pes() + write(a_time_mean, ffmt) time_mean + + if (n_op(i) /= 0) then + mflops = 1e-6 * n_op(i) * volh * meas(i)%n_call / meas(i)%total_time + mflops_mean = global_sum(real(mflops, kind=RKIND)) / num_pes() + + mflops_min = global_min(mflops) + mflops_max = global_max(mflops) + + total_gflops = 1e-3 * mflops_mean * num_pes() + + write(a_mflops_mean, ffmt) mflops_mean / n_thread + write(a_mflops_min, ffmt) mflops_min / n_thread + write(a_mflops_max, ffmt) mflops_max / n_thread + write(a_total_gflops, ffmt) total_gflops + endif + endif + + if (my_pe() == 0) then + write(unit, tab_fmt) text(i), a_n_call, a_time_mean, & + a_mflops_mean, a_mflops_min, a_mflops_max, & + a_total_gflops + endif + enddo + + call end_A(unit, "Timing") +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/traces.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/traces.F90 new file mode 100644 index 0000000000000000000000000000000000000000..7e05fcea473452153b5df5680c455eb8debf3dd5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/traces.F90 @@ -0,0 +1,166 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2000-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! traces.F90 +! +! calculates: Tr(inv(M)) psibar psi (pbp) +! Tr(gamma5 inv(M)) psibar gamma5 psi (p5p) +! Tr(inv(M+ M)) pion norm (pinorm) +! +! traces of a matrix A are calculated with a stochastic estimator: +! +! Tr(A) = eta+ A eta (eta: Gaussian noise) +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine traces(para, conf, traj, i_ensemble1, i_ensemble2) + + use typedef_hmc + use module_function_decl + use module_p_interface + use module_vol + implicit none + + integer, intent(in) :: traj, i_ensemble1, i_ensemble2 + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(in) :: conf + + P_SPINCOL_FIELD, save :: eta_e, eta_o, zeta_e, zeta_o + + character(len=*), parameter :: key_tr = "%tr" + integer, save :: count = 0 + integer :: n_sc_field, size_of_trace + integer :: cg_ncall, cg_niter_max, cg_niter_tot + REAL :: pinorm + REAL :: re_pbp, im_pbp, re_p5p, im_p5p + COMPLEX :: pbp, p5p + REAL :: res(5) + + + ALLOCATE_SC_FIELD(eta_e) + ALLOCATE_SC_FIELD(eta_o) + ALLOCATE_SC_FIELD(zeta_e) + ALLOCATE_SC_FIELD(zeta_o) + + count = count + 1 + n_sc_field = NDIRAC * NCOL * volh + size_of_trace = NDIRAC * NCOL * volume + + call ran_gauss_volh(NDIRAC * NCOL, eta_e, HALF, EVEN) + call ran_gauss_volh(NDIRAC * NCOL, eta_o, HALF, ODD) + + call init_cg_stat() + call solve(para, conf, zeta_e, zeta_o, eta_e, eta_o) ! zeta = inv(M) eta + + pbp = sc_cdotc(eta_e, zeta_e) + sc_cdotc(eta_o, zeta_o) + + pinorm = sc_dot(zeta_e, zeta_e) + sc_dot(zeta_o, zeta_o) + + call gamma5(zeta_e, volh) ! zeta = gamma5 inv(M) eta + call gamma5(zeta_o, volh) + + p5p = sc_cdotc(eta_e, zeta_e) + sc_cdotc(eta_o, zeta_o) + + res(1) = Re(pbp) + res(2) = Im(pbp) + res(3) = Re(p5p) + res(4) = Im(p5p) + res(5) = pinorm + + call global_sum_vec(5, res) + + re_pbp = res(1) / size_of_trace + im_pbp = res(2) / size_of_trace + re_p5p = res(3) / size_of_trace + im_p5p = res(4) / size_of_trace + pinorm = res(5) / size_of_trace + + call get_cg_stat(cg_ncall, cg_niter_max, cg_niter_tot) + + if (my_pe() == 0) then + if (count == 1) write(UREC, 400) & + "T", key_tr, "traj", "e", "f", & + "Re(pbp)", "Im(pbp)", "Re(p5p)", "-Im(p5p)", "PionNorm", "CGiter" + + write(UREC, 410) key_tr, traj, i_ensemble1, i_ensemble2, & + re_pbp, im_pbp, re_p5p, -im_p5p, pinorm, cg_niter_max + endif + + +400 format (1x, 2a, a6, 2a3, 5a20, a10) +410 format (1x, a4, i6, 2i3, 5g20.10, i10) + +end + +!------------------------------------------------------------------------------- +subroutine solve(para, conf, out_e, out_o, in_e, in_o) ! solves: M out = in + + use typedef_hmc + use module_vol + implicit none + + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(in) :: conf + SPINCOL_FIELD, intent(out) :: out_e, out_o + SPINCOL_FIELD, intent(in) :: in_e, in_o + + REAL :: a, b + integer :: iterations + external :: mtdagmt + + b = para%kappa / (ONE + para%h**2) + + call h_mult_c(out_o, -para%h, in_o, volh) + + call d(EVEN, ODD, out_e, out_o, conf%u) + + call sc_xpby(out_e, in_e, b) + + call mtil_dag(out_o, out_e, para, conf) + + call cg(mtdagmt, out_e, out_o, para, conf, iterations) + + call d(ODD, EVEN, out_o, out_e, conf%u) + + a = ONE / (ONE + para%h**2) + + call sc_axpby(out_o, in_o, b, a) + + call h_mult_b(-para%h, out_o, volh) + +end + +!------------------------------------------------------------------------------- +subroutine gamma5(x, volh) + + implicit none + COMPLEX, dimension (NDIRAC, *) :: x + integer :: volh + + integer :: i + COMPLEX :: x1, x2, x3, x4 + + !$omp parallel do private(x1, x2, x3, x4) + do i = 1, NCOL * volh + x1 = x(1, i) + x2 = x(2, i) + x3 = x(3, i) + x4 = x(4, i) + + x(1, i) = x3 + x(2, i) = x4 + x(3, i) = x1 + x(4, i) = x2 + enddo +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/types.h b/qcd/part_cpu/applications/QCD/src/kernel_A/types.h new file mode 100644 index 0000000000000000000000000000000000000000..269565a5c9a46e4a9816db69b467a1b10a2c9d88 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/types.h @@ -0,0 +1,10 @@ +typedef int INTSTD; +typedef short INT4; +typedef long INT8; + +typedef double REALSTD; +typedef float REAL4; +typedef double REAL8; + +typedef struct { REAL4 r, i; } COMPLEX4; +typedef struct { REAL8 r, i; } COMPLEX8; diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_bwd.c b/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_bwd.c new file mode 100644 index 0000000000000000000000000000000000000000..cdb2bd9ea4020e67a42a03b6aa677fd062ffdb4a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_bwd.c @@ -0,0 +1,353 @@ +/* +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! uuu_bwd.c - adds backward staple: r = r + a^\dagger b^\dagger c +! +!------------------------------------------------------------------------------- +*/ + +#include "types.h" + +#ifdef NamesToLower_ +# define UUU_BWD uuu_bwd_ +#endif + +#ifdef NamesToLower +# define UUU_BWD uuu_bwd +#endif + +void UUU_BWD(r, a, b, c) +COMPLEX8 *r, *a, *b, *c; +{ + register COMPLEX8 q__1, q__2; + register COMPLEX8 t1, t2, t3, x1, x2, x3; + + /* Parameter adjustments */ + c -= 4; + b -= 4; + a -= 4; + r -= 4; + + /* Function Body */ + q__1.r = a[4].r * b[4].r - a[4].i * b[4].i, + q__1.i = a[4].r * b[4].i + a[4].i * b[4].r; + t1.r = q__1.r, + t1.i = q__1.i; + q__1.r = a[4].r * b[5].r - a[4].i * b[5].i, + q__1.i = a[4].r * b[5].i + a[4].i * b[5].r; + t2.r = q__1.r, + t2.i = q__1.i; + q__1.r = a[4].r * b[6].r - a[4].i * b[6].i, + q__1.i = a[4].r * b[6].i + a[4].i * b[6].r; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[5].r * b[7].r - a[5].i * b[7].i, + q__2.i = a[5].r * b[7].i + a[5].i * b[7].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = a[5].r * b[8].r - a[5].i * b[8].i, + q__2.i = a[5].r * b[8].i + a[5].i * b[8].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = a[5].r * b[9].r - a[5].i * b[9].i, + q__2.i = a[5].r * b[9].i + a[5].i * b[9].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[6].r * b[10].r - a[6].i * b[10].i, + q__2.i = a[6].r * b[10].i + a[6].i * b[10].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = -q__1.i; + q__2.r = a[6].r * b[11].r - a[6].i * b[11].i, + q__2.i = a[6].r * b[11].i + a[6].i * b[11].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = -q__1.i; + q__2.r = a[6].r * b[12].r - a[6].i * b[12].i, + q__2.i = a[6].r * b[12].i + a[6].i * b[12].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = -q__1.i; + + q__2.r = t1.r * c[4].r - t1.i * c[4].i, + q__2.i = t1.r * c[4].i + t1.i * c[4].r; + q__1.r = r[4].r + q__2.r, + q__1.i = r[4].i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t1.r * c[7].r - t1.i * c[7].i, + q__2.i = t1.r * c[7].i + t1.i * c[7].r; + q__1.r = r[7].r + q__2.r, + q__1.i = r[7].i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t1.r * c[10].r - t1.i * c[10].i, + q__2.i = t1.r * c[10].i + t1.i * c[10].r; + q__1.r = r[10].r + q__2.r, + q__1.i = r[10].i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t2.r * c[5].r - t2.i * c[5].i, + q__2.i = t2.r * c[5].i + t2.i * c[5].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t2.r * c[8].r - t2.i * c[8].i, + q__2.i = t2.r * c[8].i + t2.i * c[8].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t2.r * c[11].r - t2.i * c[11].i, + q__2.i = t2.r * c[11].i + t2.i * c[11].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t3.r * c[6].r - t3.i * c[6].i, + q__2.i = t3.r * c[6].i + t3.i * c[6].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + r[4].r = q__1.r, + r[4].i = q__1.i; + q__2.r = t3.r * c[9].r - t3.i * c[9].i, + q__2.i = t3.r * c[9].i + t3.i * c[9].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + r[7].r = q__1.r, + r[7].i = q__1.i; + q__2.r = t3.r * c[12].r - t3.i * c[12].i, + q__2.i = t3.r * c[12].i + t3.i * c[12].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + r[10].r = q__1.r, + r[10].i = q__1.i; + + q__1.r = a[7].r * b[4].r - a[7].i * b[4].i, + q__1.i = a[7].r * b[4].i + a[7].i * b[4].r; + t1.r = q__1.r, + t1.i = q__1.i; + q__1.r = a[7].r * b[5].r - a[7].i * b[5].i, + q__1.i = a[7].r * b[5].i + a[7].i * b[5].r; + t2.r = q__1.r, + t2.i = q__1.i; + q__1.r = a[7].r * b[6].r - a[7].i * b[6].i, + q__1.i = a[7].r * b[6].i + a[7].i * b[6].r; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[8].r * b[7].r - a[8].i * b[7].i, + q__2.i = a[8].r * b[7].i + a[8].i * b[7].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = a[8].r * b[8].r - a[8].i * b[8].i, + q__2.i = a[8].r * b[8].i + a[8].i * b[8].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = a[8].r * b[9].r - a[8].i * b[9].i, + q__2.i = a[8].r * b[9].i + a[8].i * b[9].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[9].r * b[10].r - a[9].i * b[10].i, + q__2.i = a[9].r * b[10].i + a[9].i * b[10].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = -q__1.i; + q__2.r = a[9].r * b[11].r - a[9].i * b[11].i, + q__2.i = a[9].r * b[11].i + a[9].i * b[11].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = -q__1.i; + q__2.r = a[9].r * b[12].r - a[9].i * b[12].i, + q__2.i = a[9].r * b[12].i + a[9].i * b[12].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = -q__1.i; + + q__2.r = t1.r * c[4].r - t1.i * c[4].i, + q__2.i = t1.r * c[4].i + t1.i * c[4].r; + q__1.r = r[5].r + q__2.r, + q__1.i = r[5].i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t1.r * c[7].r - t1.i * c[7].i, + q__2.i = t1.r * c[7].i + t1.i * c[7].r; + q__1.r = r[8].r + q__2.r, + q__1.i = r[8].i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t1.r * c[10].r - t1.i * c[10].i, + q__2.i = t1.r * c[10].i + t1.i * c[10].r; + q__1.r = r[11].r + q__2.r, + q__1.i = r[11].i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t2.r * c[5].r - t2.i * c[5].i, + q__2.i = t2.r * c[5].i + t2.i * c[5].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t2.r * c[8].r - t2.i * c[8].i, + q__2.i = t2.r * c[8].i + t2.i * c[8].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t2.r * c[11].r - t2.i * c[11].i, + q__2.i = t2.r * c[11].i + t2.i * c[11].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t3.r * c[6].r - t3.i * c[6].i, + q__2.i = t3.r * c[6].i + t3.i * c[6].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + r[5].r = q__1.r, + r[5].i = q__1.i; + q__2.r = t3.r * c[9].r - t3.i * c[9].i, + q__2.i = t3.r * c[9].i + t3.i * c[9].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + r[8].r = q__1.r, + r[8].i = q__1.i; + q__2.r = t3.r * c[12].r - t3.i * c[12].i, + q__2.i = t3.r * c[12].i + t3.i * c[12].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + r[11].r = q__1.r, + r[11].i = q__1.i; + + q__1.r = a[10].r * b[4].r - a[10].i * b[4].i, + q__1.i = a[10].r * b[4].i + a[10].i * b[4].r; + t1.r = q__1.r, + t1.i = q__1.i; + q__1.r = a[10].r * b[5].r - a[10].i * b[5].i, + q__1.i = a[10].r * b[5].i + a[10].i * b[5].r; + t2.r = q__1.r, + t2.i = q__1.i; + q__1.r = a[10].r * b[6].r - a[10].i * b[6].i, + q__1.i = a[10].r * b[6].i + a[10].i * b[6].r; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[11].r * b[7].r - a[11].i * b[7].i, + q__2.i = a[11].r * b[7].i + a[11].i * b[7].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = a[11].r * b[8].r - a[11].i * b[8].i, + q__2.i = a[11].r * b[8].i + a[11].i * b[8].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = a[11].r * b[9].r - a[11].i * b[9].i, + q__2.i = a[11].r * b[9].i + a[11].i * b[9].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[12].r * b[10].r - a[12].i * b[10].i, + q__2.i = a[12].r * b[10].i + a[12].i * b[10].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = -q__1.i; + q__2.r = a[12].r * b[11].r - a[12].i * b[11].i, + q__2.i = a[12].r * b[11].i + a[12].i * b[11].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = -q__1.i; + q__2.r = a[12].r * b[12].r - a[12].i * b[12].i, + q__2.i = a[12].r * b[12].i + a[12].i * b[12].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = -q__1.i; + + q__2.r = t1.r * c[4].r - t1.i * c[4].i, + q__2.i = t1.r * c[4].i + t1.i * c[4].r; + q__1.r = r[6].r + q__2.r, + q__1.i = r[6].i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t1.r * c[7].r - t1.i * c[7].i, + q__2.i = t1.r * c[7].i + t1.i * c[7].r; + q__1.r = r[9].r + q__2.r, + q__1.i = r[9].i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t1.r * c[10].r - t1.i * c[10].i, + q__2.i = t1.r * c[10].i + t1.i * c[10].r; + q__1.r = r[12].r + q__2.r, + q__1.i = r[12].i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t2.r * c[5].r - t2.i * c[5].i, + q__2.i = t2.r * c[5].i + t2.i * c[5].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t2.r * c[8].r - t2.i * c[8].i, + q__2.i = t2.r * c[8].i + t2.i * c[8].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t2.r * c[11].r - t2.i * c[11].i, + q__2.i = t2.r * c[11].i + t2.i * c[11].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t3.r * c[6].r - t3.i * c[6].i, + q__2.i = t3.r * c[6].i + t3.i * c[6].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + r[6].r = q__1.r, + r[6].i = q__1.i; + q__2.r = t3.r * c[9].r - t3.i * c[9].i, + q__2.i = t3.r * c[9].i + t3.i * c[9].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + r[9].r = q__1.r, + r[9].i = q__1.i; + q__2.r = t3.r * c[12].r - t3.i * c[12].i, + q__2.i = t3.r * c[12].i + t3.i * c[12].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + r[12].r = q__1.r, + r[12].i = q__1.i; + + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_bwd_m.c b/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_bwd_m.c new file mode 100644 index 0000000000000000000000000000000000000000..6d50ebce74b3f7d6bc889aad0010d79266909f02 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_bwd_m.c @@ -0,0 +1,353 @@ +/* +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! uuu_bwd_m.c - subtracts backward staple: r = r - a^\dagger b^\dagger c +! +!------------------------------------------------------------------------------- +*/ + +#include "types.h" + +#ifdef NamesToLower_ +# define UUU_BWD_M uuu_bwd_m_ +#endif + +#ifdef NamesToLower +# define UUU_BWD_M uuu_bwd_m +#endif + +void UUU_BWD_M(r, a, b, c) +COMPLEX8 *r, *a, *b, *c; +{ + register COMPLEX8 q__1, q__2; + register COMPLEX8 t1, t2, t3, x1, x2, x3; + + /* Parameter adjustments */ + c -= 4; + b -= 4; + a -= 4; + r -= 4; + + /* Function Body */ + q__1.r = -a[4].r * b[4].r + a[4].i * b[4].i, + q__1.i = -a[4].r * b[4].i - a[4].i * b[4].r; + t1.r = q__1.r, + t1.i = q__1.i; + q__1.r = -a[4].r * b[5].r + a[4].i * b[5].i, + q__1.i = -a[4].r * b[5].i - a[4].i * b[5].r; + t2.r = q__1.r, + t2.i = q__1.i; + q__1.r = -a[4].r * b[6].r + a[4].i * b[6].i, + q__1.i = -a[4].r * b[6].i - a[4].i * b[6].r; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = -a[5].r * b[7].r + a[5].i * b[7].i, + q__2.i = -a[5].r * b[7].i - a[5].i * b[7].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = -a[5].r * b[8].r + a[5].i * b[8].i, + q__2.i = -a[5].r * b[8].i - a[5].i * b[8].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = -a[5].r * b[9].r + a[5].i * b[9].i, + q__2.i = -a[5].r * b[9].i - a[5].i * b[9].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = -a[6].r * b[10].r + a[6].i * b[10].i, + q__2.i = -a[6].r * b[10].i - a[6].i * b[10].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = -q__1.i; + q__2.r = -a[6].r * b[11].r + a[6].i * b[11].i, + q__2.i = -a[6].r * b[11].i - a[6].i * b[11].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = -q__1.i; + q__2.r = -a[6].r * b[12].r + a[6].i * b[12].i, + q__2.i = -a[6].r * b[12].i - a[6].i * b[12].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = -q__1.i; + + q__2.r = t1.r * c[4].r - t1.i * c[4].i, + q__2.i = t1.r * c[4].i + t1.i * c[4].r; + q__1.r = r[4].r + q__2.r, + q__1.i = r[4].i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t1.r * c[7].r - t1.i * c[7].i, + q__2.i = t1.r * c[7].i + t1.i * c[7].r; + q__1.r = r[7].r + q__2.r, + q__1.i = r[7].i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t1.r * c[10].r - t1.i * c[10].i, + q__2.i = t1.r * c[10].i + t1.i * c[10].r; + q__1.r = r[10].r + q__2.r, + q__1.i = r[10].i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t2.r * c[5].r - t2.i * c[5].i, + q__2.i = t2.r * c[5].i + t2.i * c[5].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t2.r * c[8].r - t2.i * c[8].i, + q__2.i = t2.r * c[8].i + t2.i * c[8].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t2.r * c[11].r - t2.i * c[11].i, + q__2.i = t2.r * c[11].i + t2.i * c[11].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t3.r * c[6].r - t3.i * c[6].i, + q__2.i = t3.r * c[6].i + t3.i * c[6].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + r[4].r = q__1.r, + r[4].i = q__1.i; + q__2.r = t3.r * c[9].r - t3.i * c[9].i, + q__2.i = t3.r * c[9].i + t3.i * c[9].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + r[7].r = q__1.r, + r[7].i = q__1.i; + q__2.r = t3.r * c[12].r - t3.i * c[12].i, + q__2.i = t3.r * c[12].i + t3.i * c[12].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + r[10].r = q__1.r, + r[10].i = q__1.i; + + q__1.r = -a[7].r * b[4].r + a[7].i * b[4].i, + q__1.i = -a[7].r * b[4].i - a[7].i * b[4].r; + t1.r = q__1.r, + t1.i = q__1.i; + q__1.r = -a[7].r * b[5].r + a[7].i * b[5].i, + q__1.i = -a[7].r * b[5].i - a[7].i * b[5].r; + t2.r = q__1.r, + t2.i = q__1.i; + q__1.r = -a[7].r * b[6].r + a[7].i * b[6].i, + q__1.i = -a[7].r * b[6].i - a[7].i * b[6].r; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = -a[8].r * b[7].r + a[8].i * b[7].i, + q__2.i = -a[8].r * b[7].i - a[8].i * b[7].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = -a[8].r * b[8].r + a[8].i * b[8].i, + q__2.i = -a[8].r * b[8].i - a[8].i * b[8].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = -a[8].r * b[9].r + a[8].i * b[9].i, + q__2.i = -a[8].r * b[9].i - a[8].i * b[9].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = -a[9].r * b[10].r + a[9].i * b[10].i, + q__2.i = -a[9].r * b[10].i - a[9].i * b[10].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = -q__1.i; + q__2.r = -a[9].r * b[11].r + a[9].i * b[11].i, + q__2.i = -a[9].r * b[11].i - a[9].i * b[11].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = -q__1.i; + q__2.r = -a[9].r * b[12].r + a[9].i * b[12].i, + q__2.i = -a[9].r * b[12].i - a[9].i * b[12].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = -q__1.i; + + q__2.r = t1.r * c[4].r - t1.i * c[4].i, + q__2.i = t1.r * c[4].i + t1.i * c[4].r; + q__1.r = r[5].r + q__2.r, + q__1.i = r[5].i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t1.r * c[7].r - t1.i * c[7].i, + q__2.i = t1.r * c[7].i + t1.i * c[7].r; + q__1.r = r[8].r + q__2.r, + q__1.i = r[8].i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t1.r * c[10].r - t1.i * c[10].i, + q__2.i = t1.r * c[10].i + t1.i * c[10].r; + q__1.r = r[11].r + q__2.r, + q__1.i = r[11].i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t2.r * c[5].r - t2.i * c[5].i, + q__2.i = t2.r * c[5].i + t2.i * c[5].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t2.r * c[8].r - t2.i * c[8].i, + q__2.i = t2.r * c[8].i + t2.i * c[8].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t2.r * c[11].r - t2.i * c[11].i, + q__2.i = t2.r * c[11].i + t2.i * c[11].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t3.r * c[6].r - t3.i * c[6].i, + q__2.i = t3.r * c[6].i + t3.i * c[6].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + r[5].r = q__1.r, + r[5].i = q__1.i; + q__2.r = t3.r * c[9].r - t3.i * c[9].i, + q__2.i = t3.r * c[9].i + t3.i * c[9].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + r[8].r = q__1.r, + r[8].i = q__1.i; + q__2.r = t3.r * c[12].r - t3.i * c[12].i, + q__2.i = t3.r * c[12].i + t3.i * c[12].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + r[11].r = q__1.r, + r[11].i = q__1.i; + + q__1.r = -a[10].r * b[4].r + a[10].i * b[4].i, + q__1.i = -a[10].r * b[4].i - a[10].i * b[4].r; + t1.r = q__1.r, + t1.i = q__1.i; + q__1.r = -a[10].r * b[5].r + a[10].i * b[5].i, + q__1.i = -a[10].r * b[5].i - a[10].i * b[5].r; + t2.r = q__1.r, + t2.i = q__1.i; + q__1.r = -a[10].r * b[6].r + a[10].i * b[6].i, + q__1.i = -a[10].r * b[6].i - a[10].i * b[6].r; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = -a[11].r * b[7].r + a[11].i * b[7].i, + q__2.i = -a[11].r * b[7].i - a[11].i * b[7].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = -a[11].r * b[8].r + a[11].i * b[8].i, + q__2.i = -a[11].r * b[8].i - a[11].i * b[8].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = -a[11].r * b[9].r + a[11].i * b[9].i, + q__2.i = -a[11].r * b[9].i - a[11].i * b[9].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = -a[12].r * b[10].r + a[12].i * b[10].i, + q__2.i = -a[12].r * b[10].i - a[12].i * b[10].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = -q__1.i; + q__2.r = -a[12].r * b[11].r + a[12].i * b[11].i, + q__2.i = -a[12].r * b[11].i - a[12].i * b[11].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = -q__1.i; + q__2.r = -a[12].r * b[12].r + a[12].i * b[12].i, + q__2.i = -a[12].r * b[12].i - a[12].i * b[12].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = -q__1.i; + + q__2.r = t1.r * c[4].r - t1.i * c[4].i, + q__2.i = t1.r * c[4].i + t1.i * c[4].r; + q__1.r = r[6].r + q__2.r, + q__1.i = r[6].i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t1.r * c[7].r - t1.i * c[7].i, + q__2.i = t1.r * c[7].i + t1.i * c[7].r; + q__1.r = r[9].r + q__2.r, + q__1.i = r[9].i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t1.r * c[10].r - t1.i * c[10].i, + q__2.i = t1.r * c[10].i + t1.i * c[10].r; + q__1.r = r[12].r + q__2.r, + q__1.i = r[12].i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t2.r * c[5].r - t2.i * c[5].i, + q__2.i = t2.r * c[5].i + t2.i * c[5].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t2.r * c[8].r - t2.i * c[8].i, + q__2.i = t2.r * c[8].i + t2.i * c[8].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t2.r * c[11].r - t2.i * c[11].i, + q__2.i = t2.r * c[11].i + t2.i * c[11].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t3.r * c[6].r - t3.i * c[6].i, + q__2.i = t3.r * c[6].i + t3.i * c[6].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + r[6].r = q__1.r, + r[6].i = q__1.i; + q__2.r = t3.r * c[9].r - t3.i * c[9].i, + q__2.i = t3.r * c[9].i + t3.i * c[9].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + r[9].r = q__1.r, + r[9].i = q__1.i; + q__2.r = t3.r * c[12].r - t3.i * c[12].i, + q__2.i = t3.r * c[12].i + t3.i * c[12].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + r[12].r = q__1.r, + r[12].i = q__1.i; + + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_f90.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_f90.F90 new file mode 100644 index 0000000000000000000000000000000000000000..53ca89df005fe7caa5aab26fc99f99964534a967 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_f90.F90 @@ -0,0 +1,73 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! uuu_f90.F90 - Fortran loops for (U * U * U) +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine uuu_bwd(r, a, b, c) ! adds backward staple: + ! r = r + a^\dagger b^\dagger c + implicit none + SU3 :: r, a, b, c + integer :: i, j, k, m + + do i = 1, NCOL + do j = 1, NCOL + do k = 1, NCOL + do m = 1, NCOL + r(i,j) = r(i,j) + conjg(a(k,i)) * conjg(b(m,k)) * c(m,j) + enddo + enddo + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine uuu_bwd_m(r, a, b, c) ! subtracts backward staple: + ! r = r - a^\dagger b^\dagger c + implicit none + SU3 :: r, a, b, c + integer :: i, j, k, m + + do i = 1, NCOL + do j = 1, NCOL + do k = 1, NCOL + do m = 1, NCOL + r(i,j) = r(i,j) - conjg(a(k,i)) * conjg(b(m,k)) * c(m,j) + enddo + enddo + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine uuu_fwd(r, a, b, c) ! adds forward staple: + ! r = r + a b^\dagger c^\dagger + implicit none + SU3 :: r, a, b, c + integer :: i, j, k, m + + do i = 1, NCOL + do j = 1, NCOL + do k = 1, NCOL + do m = 1, NCOL + r(i,j) = r(i,j) + a(i,k) * conjg(b(m,k)) * conjg(c(j,m)) + enddo + enddo + enddo + enddo + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_fwd.c b/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_fwd.c new file mode 100644 index 0000000000000000000000000000000000000000..c0b544526f0e410f19639d6873d0b6ed4b9dcf08 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_fwd.c @@ -0,0 +1,353 @@ +/* +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! uuu_fwd.c - adds forward staple: r = r + a b^\dagger c^\dagger +! +!------------------------------------------------------------------------------- +*/ + +#include "types.h" + +#ifdef NamesToLower_ +# define UUU_FWD uuu_fwd_ +#endif + +#ifdef NamesToLower +# define UUU_FWD uuu_fwd +#endif + +void UUU_FWD(r, a, b, c) +COMPLEX8 *r, *a, *b, *c; +{ + register COMPLEX8 q__1, q__2; + register COMPLEX8 t1, t2, t3, x1, x2, x3; + + /* Parameter adjustments */ + c -= 4; + b -= 4; + a -= 4; + r -= 4; + + /* Function Body */ + q__1.r = a[4].r * b[4].r + a[4].i * b[4].i, + q__1.i = -a[4].r * b[4].i + a[4].i * b[4].r; + t1.r = q__1.r, + t1.i = q__1.i; + q__1.r = a[4].r * b[5].r + a[4].i * b[5].i, + q__1.i = -a[4].r * b[5].i + a[4].i * b[5].r; + t2.r = q__1.r, + t2.i = q__1.i; + q__1.r = a[4].r * b[6].r + a[4].i * b[6].i, + q__1.i = -a[4].r * b[6].i + a[4].i * b[6].r; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[7].r * b[7].r + a[7].i * b[7].i, + q__2.i = -a[7].r * b[7].i + a[7].i * b[7].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = a[7].r * b[8].r + a[7].i * b[8].i, + q__2.i = -a[7].r * b[8].i + a[7].i * b[8].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = a[7].r * b[9].r + a[7].i * b[9].i, + q__2.i = -a[7].r * b[9].i + a[7].i * b[9].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[10].r * b[10].r + a[10].i * b[10].i, + q__2.i = -a[10].r * b[10].i + a[10].i * b[10].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = a[10].r * b[11].r + a[10].i * b[11].i, + q__2.i = -a[10].r * b[11].i + a[10].i * b[11].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = a[10].r * b[12].r + a[10].i * b[12].i, + q__2.i = -a[10].r * b[12].i + a[10].i * b[12].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = q__1.i; + + q__2.r = t1.r * c[4].r + t1.i * c[4].i, + q__2.i = -t1.r * c[4].i + t1.i * c[4].r; + q__1.r = r[4].r + q__2.r, + q__1.i = r[4].i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t1.r * c[5].r + t1.i * c[5].i, + q__2.i = -t1.r * c[5].i + t1.i * c[5].r; + q__1.r = r[7].r + q__2.r, + q__1.i = r[7].i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t1.r * c[6].r + t1.i * c[6].i, + q__2.i = -t1.r * c[6].i + t1.i * c[6].r; + q__1.r = r[10].r + q__2.r, + q__1.i = r[10].i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t2.r * c[7].r + t2.i * c[7].i, + q__2.i = -t2.r * c[7].i + t2.i * c[7].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t2.r * c[8].r + t2.i * c[8].i, + q__2.i = -t2.r * c[8].i + t2.i * c[8].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t2.r * c[9].r + t2.i * c[9].i, + q__2.i = -t2.r * c[9].i + t2.i * c[9].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t3.r * c[10].r + t3.i * c[10].i, + q__2.i = -t3.r * c[10].i + t3.i * c[10].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + r[4].r = q__1.r, + r[4].i = q__1.i; + q__2.r = t3.r * c[11].r + t3.i * c[11].i, + q__2.i = -t3.r * c[11].i + t3.i * c[11].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + r[7].r = q__1.r, + r[7].i = q__1.i; + q__2.r = t3.r * c[12].r + t3.i * c[12].i, + q__2.i = -t3.r * c[12].i + t3.i * c[12].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + r[10].r = q__1.r, + r[10].i = q__1.i; + + q__1.r = a[5].r * b[4].r + a[5].i * b[4].i, + q__1.i = -a[5].r * b[4].i + a[5].i * b[4].r; + t1.r = q__1.r, + t1.i = q__1.i; + q__1.r = a[5].r * b[5].r + a[5].i * b[5].i, + q__1.i = -a[5].r * b[5].i + a[5].i * b[5].r; + t2.r = q__1.r, + t2.i = q__1.i; + q__1.r = a[5].r * b[6].r + a[5].i * b[6].i, + q__1.i = -a[5].r * b[6].i + a[5].i * b[6].r; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[8].r * b[7].r + a[8].i * b[7].i, + q__2.i = -a[8].r * b[7].i + a[8].i * b[7].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = a[8].r * b[8].r + a[8].i * b[8].i, + q__2.i = -a[8].r * b[8].i + a[8].i * b[8].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = a[8].r * b[9].r + a[8].i * b[9].i, + q__2.i = -a[8].r * b[9].i + a[8].i * b[9].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[11].r * b[10].r + a[11].i * b[10].i, + q__2.i = -a[11].r * b[10].i + a[11].i * b[10].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = a[11].r * b[11].r + a[11].i * b[11].i, + q__2.i = -a[11].r * b[11].i + a[11].i * b[11].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = a[11].r * b[12].r + a[11].i * b[12].i, + q__2.i = -a[11].r * b[12].i + a[11].i * b[12].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = q__1.i; + + q__2.r = t1.r * c[4].r + t1.i * c[4].i, + q__2.i = -t1.r * c[4].i + t1.i * c[4].r; + q__1.r = r[5].r + q__2.r, + q__1.i = r[5].i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t1.r * c[5].r + t1.i * c[5].i, + q__2.i = -t1.r * c[5].i + t1.i * c[5].r; + q__1.r = r[8].r + q__2.r, + q__1.i = r[8].i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t1.r * c[6].r + t1.i * c[6].i, + q__2.i = -t1.r * c[6].i + t1.i * c[6].r; + q__1.r = r[11].r + q__2.r, + q__1.i = r[11].i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t2.r * c[7].r + t2.i * c[7].i, + q__2.i = -t2.r * c[7].i + t2.i * c[7].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t2.r * c[8].r + t2.i * c[8].i, + q__2.i = -t2.r * c[8].i + t2.i * c[8].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t2.r * c[9].r + t2.i * c[9].i, + q__2.i = -t2.r * c[9].i + t2.i * c[9].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t3.r * c[10].r + t3.i * c[10].i, + q__2.i = -t3.r * c[10].i + t3.i * c[10].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + r[5].r = q__1.r, + r[5].i = q__1.i; + q__2.r = t3.r * c[11].r + t3.i * c[11].i, + q__2.i = -t3.r * c[11].i + t3.i * c[11].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + r[8].r = q__1.r, + r[8].i = q__1.i; + q__2.r = t3.r * c[12].r + t3.i * c[12].i, + q__2.i = -t3.r * c[12].i + t3.i * c[12].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + r[11].r = q__1.r, + r[11].i = q__1.i; + + q__1.r = a[6].r * b[4].r + a[6].i * b[4].i, + q__1.i = -a[6].r * b[4].i + a[6].i * b[4].r; + t1.r = q__1.r, + t1.i = q__1.i; + q__1.r = a[6].r * b[5].r + a[6].i * b[5].i, + q__1.i = -a[6].r * b[5].i + a[6].i * b[5].r; + t2.r = q__1.r, + t2.i = q__1.i; + q__1.r = a[6].r * b[6].r + a[6].i * b[6].i, + q__1.i = -a[6].r * b[6].i + a[6].i * b[6].r; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[9].r * b[7].r + a[9].i * b[7].i, + q__2.i = -a[9].r * b[7].i + a[9].i * b[7].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = a[9].r * b[8].r + a[9].i * b[8].i, + q__2.i = -a[9].r * b[8].i + a[9].i * b[8].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = a[9].r * b[9].r + a[9].i * b[9].i, + q__2.i = -a[9].r * b[9].i + a[9].i * b[9].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[12].r * b[10].r + a[12].i * b[10].i, + q__2.i = -a[12].r * b[10].i + a[12].i * b[10].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = a[12].r * b[11].r + a[12].i * b[11].i, + q__2.i = -a[12].r * b[11].i + a[12].i * b[11].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = a[12].r * b[12].r + a[12].i * b[12].i, + q__2.i = -a[12].r * b[12].i + a[12].i * b[12].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = q__1.i; + + q__2.r = t1.r * c[4].r + t1.i * c[4].i, + q__2.i = -t1.r * c[4].i + t1.i * c[4].r; + q__1.r = r[6].r + q__2.r, + q__1.i = r[6].i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t1.r * c[5].r + t1.i * c[5].i, + q__2.i = -t1.r * c[5].i + t1.i * c[5].r; + q__1.r = r[9].r + q__2.r, + q__1.i = r[9].i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t1.r * c[6].r + t1.i * c[6].i, + q__2.i = -t1.r * c[6].i + t1.i * c[6].r; + q__1.r = r[12].r + q__2.r, + q__1.i = r[12].i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t2.r * c[7].r + t2.i * c[7].i, + q__2.i = -t2.r * c[7].i + t2.i * c[7].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t2.r * c[8].r + t2.i * c[8].i, + q__2.i = -t2.r * c[8].i + t2.i * c[8].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t2.r * c[9].r + t2.i * c[9].i, + q__2.i = -t2.r * c[9].i + t2.i * c[9].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t3.r * c[10].r + t3.i * c[10].i, + q__2.i = -t3.r * c[10].i + t3.i * c[10].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + r[6].r = q__1.r, + r[6].i = q__1.i; + q__2.r = t3.r * c[11].r + t3.i * c[11].i, + q__2.i = -t3.r * c[11].i + t3.i * c[11].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + r[9].r = q__1.r, + r[9].i = q__1.i; + q__2.r = t3.r * c[12].r + t3.i * c[12].i, + q__2.i = -t3.r * c[12].i + t3.i * c[12].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + r[12].r = q__1.r, + r[12].i = q__1.i; + + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/w_mult.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/w_mult.F90 new file mode 100644 index 0000000000000000000000000000000000000000..1fafce298e018acbddfdb050ec791601a411a9d5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/w_mult.F90 @@ -0,0 +1,78 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! w_mult.F90 - W := M~ + rho (Hasenbusch improvement) +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine w_mult(out, in, para, conf) ! out = W in + + use typedef_hmc + use module_vol + implicit none + + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(in) :: conf + + SPINCOL_FIELD, intent(out) :: out + SPINCOL_FIELD, intent(in) :: in + + call mtil(out, in, para, conf) + + call sc_axpy(out, in, para%rho) + +end + +!------------------------------------------------------------------------------- +subroutine w_mult_dag(out, in, para, conf) ! out = W+ in + + use typedef_hmc + use module_vol + implicit none + + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(in) :: conf + + SPINCOL_FIELD, intent(out) :: out + SPINCOL_FIELD, intent(in) :: in + + call mtil_dag(out, in, para, conf) + + call sc_axpy(out, in, para%rho) + +end + +!------------------------------------------------------------------------------- +subroutine w_dagger_w(out, in, para, conf) ! out = (W+ W) in + + use typedef_hmc + use module_p_interface + use module_vol + implicit none + + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(in) :: conf + + SPINCOL_FIELD :: out, in + P_SPINCOL_FIELD, save :: tmp + + TIMING_START(timing_bin_mtdagmt) + + ALLOCATE_SC_FIELD(tmp) + + call w_mult(tmp, in, para, conf) + call w_mult_dag(out, tmp, para, conf) + + TIMING_STOP(timing_bin_mtdagmt) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/xyzt2i.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/xyzt2i.F90 new file mode 100644 index 0000000000000000000000000000000000000000..d87af69ff7a72b83981bed4df13362b5b8c64aa4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/xyzt2i.F90 @@ -0,0 +1,97 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! xyzt2i.F90 - maps local coordinates (x,y,z,t) to even/odd index +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +integer function xyzt2i(x_in) + + ! x_in := (x,y,z,t) + ! -1 <= x(mu) <= N(mu) ; mu = 1,2,3,4 + ! xyzt2i >= 1 + + use module_function_decl + use module_lattice + use module_offset + implicit none + + integer, dimension (DIM), intent(in) :: x_in + integer, dimension (DIM) :: dir, i, m, x + integer :: count, mu + integer, external :: ieo, n_sites, i_periodic, ilex + + + count = 0 + do mu = 1, DIM + + x(mu) = x_in(mu) + + if (x(mu) < -1 .or. x(mu) > N(mu)) then + call die('xyzt2i(): x(mu) out of range') + endif + + if (NPE(mu) == 1) x(mu) = i_periodic(x(mu), N(mu)) + + if (x(mu) == -1) then + dir(mu) = -1 + elseif (x(mu) == N(mu)) then + dir(mu) = 1 + else + dir(mu) = 0 + count = count + 1 + endif + + if (dir(mu) /= 0) then + i(mu) = 0 + m(mu) = 1 + else + i(mu) = x(mu) + m(mu) = N(mu) + endif + enddo + + if (count == DIM) then + xyzt2i = offset(0,0,0,0) + ieo(DIM, x, N) + 1 + else + ASSERT(num_pes() /= 1) + + if (dir(1) /= 0) then + !!ASSERT(n_sites(DIM, dir, N, NPE) == n_sites(DIM, dir, NH, NPE)) + !!ASSERT(ilex(DIM, i, m) <= n_sites(DIM, dir, NH, NPE)) + xyzt2i = offset(dir(1),dir(2),dir(3),dir(4)) + ilex(DIM, i, m) + 1 + else + xyzt2i = offset(dir(1),dir(2),dir(3),dir(4)) + ieo(DIM, i, m) + 1 + endif + endif + +end + +!------------------------------------------------------------------------------- +integer function std_xyzt2i(x) + + use module_lattice + implicit none + + integer, dimension (DIM), intent(in) :: x + integer, dimension (DIM) :: x_act + integer, external :: xyzt2i + + x_act(1) = x(gamma_index(1)) + x_act(2) = x(gamma_index(2)) + x_act(3) = x(gamma_index(3)) + x_act(4) = x(gamma_index(4)) + + std_xyzt2i = xyzt2i(x_act) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_B/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..31b2207cc57537d5ff8f443a772d78649b7167ed --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/Makefile @@ -0,0 +1,10 @@ +include Makefile.defs + +#MODULE_INIT# +#MODULE_CMD# #MODULE_FILES# + +kernel: + cd libraries && $(MAKE) all + cd su3h_n && $(MAKE) kernel-objects + $(AR) $(ARFLAGS) ../kernel_B.a libraries/*.o su3h_n/*.o + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/Makefile.defs.in b/qcd/part_cpu/applications/QCD/src/kernel_B/Makefile.defs.in new file mode 100644 index 0000000000000000000000000000000000000000..3a5a290170bef0e2a21cc3daa385544ec63bbdab --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/Makefile.defs.in @@ -0,0 +1,13 @@ +MAKE = #MAKE# + +RM = #RM# + +AR = #AR# +ARFLAGS = #ARFLAGS# + +CC = #CC# +CFLAGS = #CFLAGS# + +MPI_CC = #MPI_CC# + +LDFLAGS = #LDFLAGS# diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/aa/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..0b41dc11a810a6d13720e9cbef1b7e298dc700e7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/Makefile @@ -0,0 +1,12 @@ +OBJ = calclist.o dblarr.o halt.o io_unformat.o jacobi.o + +.PHONY : clean + +aa : $(OBJ) + $(CC) aa.c $(OBJ) -o aa -lm + +%.o : %.c + $(CC) -c $< -o $@ + +clean : + rm -f *.o diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/aa/aa.c b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/aa.c new file mode 100644 index 0000000000000000000000000000000000000000..f92c87803c6e540f8ddd1aaf7ba110d59a359a9a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/aa.c @@ -0,0 +1,1283 @@ +/************************************************************************* + * + * This c-program analyzes autocorrelations from + * histogram datafiles + * + * aa [opt] 'name' + * input files: + * histogram file + * output to standard-output + * + * COMPILE: + * cc -o aa aa.c io_unformat.c calclist.c dblarr.c jacobi.c halt.c -lm + * + *************************************************************************/ + + +#include +#include +#include +#include +#include +/* #include */ +#include + +#include "stuff.h" + +/* #include */ + +/* #define max(x,y) (((x) > (y)) ? (x) : (y)) */ + +/* prototypes... + */ + + +#define MAX_DATA 10000 + +#define PI 3.14159265358979 +#define pi2 (PI*2.0) + +double * dblalloc(int num); +void halt(char *message,char *msg); +int ccorr_matrix(double *dat[],int n,int length,int jack, + double *wd,int weight,double wnorm); +int history(int nv, double a[],int showlevel,double level_val, + double wd[],int weight); +int errorcalc(int nconf,double svect[],double *avep,double *sigp, + double *fsp,double *fser); +int autocorr(double *d,int nd,double *res,int nres); +int histgr2(double *a,double *b,int ndata,int bins,double *wd,int weight, + double low1,double high1,double low2,double high2); +double tunnelcalc(int nd,double *d,double tmin,double tmax); + +int noauto = 0; + +char usage[] = " usage: aa [-opt] hist-file\n\ + q : quiet\n\ + j nblocks : jackknife data\n\ + J nblocks : print jackknife blocks\n\ + A : print all measurements as jackknife blocks\n\ + i iters : use only i iterations\n\ + s iters : skip iteration\n\ + S n : use only every n iteration\n\ + d num : use only data num\n\ + D #2+#3,#4 : collect data and do the arithmetic [+-*/]\n\ + R r1:r2 : print data from r1 to r2\n\ + f num=val : filter data with vector num = value\n\ + l[L] a:b : data range from a to b [element 2]\n\ + X : print just the data indicated\n\ + C : cross-correlation of data\n\ + E : eigenvalue analysis of cross-correlation matrix\n\ + c num : correlation number num\n\ + cl length : force vector length\n\ + cc/cd num : print 2-dim. array number num\n\ + cos : cosine transform the data (only with -j, -A)\n\ + a : sum -D or -c into one data element\n\ + x d,i,num : datavector element d[index[num]], num optional\n\ + K n : take modulo of index\n\ + h : print histogram plot\n\ + v value : show value in the histogram\n\ + V : divide value by volume\n\ + Q : multiply value by sqrt of volume\n\ + w : use the weight factor (elem 1)\n\ + 0 : subtract the 0-moment (average)\n\ + 2 : print second moment\n\ + p power : print moment power\n\ + F bins : print histogram in weight format\n\ + r block : do the 'running blocking'\n\ + b block : do the 'normal blocking'\n\ + m/M value : analyze only up to minimum/maximum value\n\ + O : list measurements up to minimum/maximum value\n\ + P : lag the list by one unit\n\ + g [bins] : print histogram to the standard output, with bins bins\n\ + B [binw] : print histogram, with binwidth [1]\n\ + H : print history to the standard output\n\ + G bins : 2-component histogram\n\ + W b_old,b_new,\'act_string\' : reweight to new beta value (up to 10 times)\n\ + Y bg,mU_old,mU_new,T_old,T_new : reweight to new T value (susy)\n\ + Z mH_old,mH_new,b_old,b_new : reweight to new mH, betaH -values (su2-Higgs)\n\ + y tmin:tmax : print tunneling time\n\ + t : no autocorrelations\n\ + T length : print autocorrelation function for distance length\n"; + +main(argc,argv) + int argc; + char * argv[]; + +{ + double *fp,*dat[MAX_DATA],*tmparr,*wd; + double *tmpx; + int block,indexed,iv,dv,index; + int nblocking,length,jack,sum; + double temps,minval,maxval,tmin,tmax; + char * ss,*lists[MAX_DATA]; + int raw,eig; + int idata,nd; + int i,j,jj,k,iters,hist,printhist; + long lk; + int gram,rblock,bblock,skip,weight; + double hbin,error,naive,aveg,average[MAX_DATA],timerel,level_val; + int list,limits,lagged,showlevel,stagger,mom2,ccorr; + int vol,volume,wgram,irange1,irange2,icorr,printjack,index_mod; + int four,h2gram,reweight,susy,higgs,isarray,tunnel; + int quiet,sqrtV,icorrlen; + int filter; + double filtervalue; + double betar,wnorm,vnorm; + double mU,mUn,mH,mH2,T,Tn; + int autocorrelation = 0,sub_ave = 0; + double power = 0,beta1[10],beta2[10]; + char rwstring[10][100]; + FILE *ff; + e_header h; + double low1,high1,low2,high2; + double susy_mul,betag; + + hbin = 0.0; + block = nblocking = iters = hist = printhist = gram = idata = + rblock = bblock = jack = list = limits = stagger = mom2 = quiet = 0; + lagged = showlevel = weight = volume = wgram = 0; + irange1 = irange2 = icorr = icorrlen = 0; + ccorr = sqrtV = sum = four = h2gram = 0; + printjack = indexed = index_mod = reweight = 0; + susy = higgs = raw = eig = isarray = 0; + filter = tunnel = 0; + nd = 0; + minval = -1e-60; + maxval = 1e60; + + low1=low2=high1=high2=0; + + skip = 0; + if (argc <= 1) halt(usage,NULL); + + while (--argc > 0 && (*++argv)[0] == '-') { + ss = argv[0] + 1; + + while (*ss) { + switch(*ss++) { + + case 'q': quiet = 1; break; + case 'O': list = 1; break; + case 'P': lagged = 1; break; + case 't': noauto = 1; break; + case 'T': getnum("%d",&autocorrelation); break; + case 'y': tunnel = 1; get2num("%lg:%lg",&tmin,&tmax); break; + case 'w': weight = 1; break; + case '0': sub_ave = 1; break; + case '2': mom2 = 1; break; + case 'a': sum = 1; break; + case 'p': getnum("%lg",&power); break; + case 'F': wgram = weight = volume = 1; idata = 4; + getnum("%d",&gram); break; + case 'V': volume = 1; break; + case 'Q': sqrtV = 1; break; + case 'c': + if (strcmp(ss,"os") == 0) { four = 1; ss+=2; } + else if (*ss == 'l') { ss++; getnum("%d",&icorrlen); } + else { + if (*ss == 'c') { isarray=1; ss++; } + if (*ss == 'd') { isarray=2; ss++; } + getnum("%d",&icorr); + } + break; + case 'm': limits = 1; getnum("%lg",&minval); break; + case 'M': limits = 1; getnum("%lg",&maxval); break; + case 'i': getnum("%d",&iters); break; + case 's': getnum("%d",&skip); break; + case 'S': getnum("%d",&stagger); break; + case 'r': getnum("%d",&rblock); break; + case 'b': getnum("%d",&bblock); break; + case 'h': hist = 1; break; + case 'H': printhist = 1; break; + case 'v': showlevel = 1; getnum("%lg",&level_val); break; + case 'j': getnum("%d",&jack); break; + case 'J': printjack = 1; getnum("%d",&jack); break; + case 'A': printjack = 1; jack = -1; break; + case 'W': + /* take away white space */ + { char *p,*q; + p = q = ss; + while (*p) { + if (*p != ' ' && *p != '\t') *(q++) = *p; + p++; + } + *q = 0; + } + get3num("%lg,%lg,%s",&beta1[reweight],&beta2[reweight],rwstring[reweight]); + beta2[reweight] -= beta1[reweight]; + reweight++; + break; + case 'Y': reweight = susy = 1; + get5num("%lg,%lg,%lg,%lg,%lg",&betag,&mU,&mUn,&T,&Tn); + break; + case 'Z': reweight = higgs = 1; + get4num("%lg,%lg,%lg,%lg",&mH,&mH2,&beta1[0],&beta2[0]); + break; + + case 'd': getnum("%d",&idata); nd = 1; break; + case 'D': getlist(lists,nd); break; + + case 'f': + get2num("%d=%lg",&filter,&filtervalue); + break; + + case 'X': raw = 1; break; + case 'E': eig = 1; break; + + case 'R': + if (!(*ss) && --argc) ss = (++argv)[0]; + if (sscanf(ss,"%d:%d",&irange1,&irange2) != 2) halt(usage,NULL); + ss = strchr(ss,0); + break; + + case 'x': + indexed = 1; + if (!(*ss) && --argc) ss = (++argv)[0]; + if ((i = sscanf(ss,"%d,%d,%d",&dv,&iv,&index)) == 3) { + irange1 = irange2 = index; + } else if (i == 2) index = 0; + else halt(usage,NULL); + ss = strchr(ss,0); + break; + + case 'K': getnum("%d",&index_mod); break; + + case 'C': ccorr = 1; break; + + case 'B': getoptnum("%lg",&hbin,1.0); gram = 1; break; + case 'g': getoptnum("%d",&gram,100); break; + case 'G': getnum("%d",&h2gram); break; + + case 'l': get2num("%lg:%lg",&low1,&high1); break; + case 'L': get2num("%lg:%lg",&low2,&high2); break; + + default: halt(usage,NULL); + + } /* switch */ + } + } /* while */ + + if (argc == 0) halt(usage,NULL); + + /* now find the histogram file length */ + + ss = argv[0]; + + ff = fopen(ss,"r"); + if (ff == NULL) halt("Could not open file %s",ss); + + block = readheader(ff,&h); + if (icorrlen) h.d2 = icorrlen; + tmparr = dblarr(block); + + vol = h.lx*h.ly*h.lz*h.lt; + + if (indexed && !irange1) icorr = 1; + + if (icorr) { + if (h.d1 >= icorr || icorrlen) { + irange1 = 1 + (icorr-1)*h.d2; + irange2 = irange1 + h.d2 - 1; + } else if (h.d1 + h.d3 >= icorr) { + irange1 = 1 + h.d1*h.d2 + (icorr-1-h.d1)*h.d4; + irange2 = irange1 + h.d4 - 1; + } else halt("No such thing!",NULL); + } + + if (indexed) tmpx = dblarr(irange2-irange1+1); + + /* do not assume only doubles */ + if (iters) iters += skip; + + length = 0; + do { + length++; + lk = readdata(ff,tmparr); + } while (lk == length && iters != length); + if (lk != length) length--; + rewind(ff); + skipheader(ff); + if (icorrlen) h.d2 = icorrlen; + + length -= (skip); + if (!quiet) { + fprintf(stderr,"* Data: %d Measurements: %d\n",block,length); + fprintf(stderr," double %ld, float %ld, long %ld, char %ld\n", + h.n_double,h.n_float,h.n_long,h.n_char); + } + + if (length <= 0) halt("Measurements == 0",NULL); + if (jack < 0) jack = length; + + if (volume) vnorm = 1.0/vol; else vnorm = 1; + if (sqrtV) vnorm *= sqrt(vol); + + if (!raw) { + if (nd) { + for (i=0; i block) halt("Illegal data number",NULL); + if (irange1 > irange2 || irange2 > block) halt("Illegal range spec",NULL); + + if (weight || reweight) wd = dblarr(length); + + if (susy) susy_mul = pow(4.0/(betag * 4.0/9.0),3.0) * (h.lx*h.ly*h.lz); + + j = skip; + while (j-- > 0) lk = readdata(ff,tmparr); + + if (idata) fp = dat[0]; + for (jj=j=0; jj < length; jj++) { + extern int calclist_index; + + lk = readdata(ff,tmparr); + + calclist_index = jj; + + k=0; + if (!filter || tmparr[filter-1] == filtervalue) { + + if (reweight || weight) { + double w; + static int first=1; + static double w0; + + if (susy) + /*** THis is the `old' susy weight + w = 0.379*(sqr(100.0/T) - sqr(100.0/Tn)) * tmparr[8] + + 0.849*(sqr(mU/T) - sqr(mUn/Tn)) * tmparr[16]; + *****/ + + w = -susy_mul*( (1.0/sqr(T) - 1.0/sqr(Tn)) * + ( 18384.1*tmparr[3] - 3984.08*tmparr[4] - 2*1191.72*tmparr[7] + + 2*96.6867*tmparr[8] ) + - (sqr(mU/T) - sqr(mUn/Tn))*tmparr[14] ); + + else if (higgs) w = (beta2[0]/beta1[0]-1)*tmparr[2] + + (sqr(beta2[0]*mH2/(beta1[0]*mH))-1)*tmparr[4]; + else if (reweight) + for (w=i=0; i= minval) { + if (lagged && !lag_on) lag_on = 1; + else { + j++; + if (!isaccept) { + isaccept = 1; + begmeas = i; + } + } + } else { + lag_on = 0; + if (isaccept) { + isaccept = 0; + printf("%d %d\n",begmeas,i-1); + } + } + } + if (isaccept) printf("%d %d\n",begmeas,length-1); + + if (!quiet) fprintf(stderr," - accepted %d of %d measurements\n",j,length); + exit(-1); + } + + if (limits) { + double *dd[MAX_DATA],mval,Mval; + + for (idata=irange1; idata<=irange2; idata++) dd[idata] = dblarr(length); + for (i=j=0; i= minval) { + for (idata=irange1; idata<=irange2; idata++) + dd[idata][j] = dat[idata][i]; + j++; + } + } + for (idata=irange1; idata<=irange2; idata++) { + free(dat[idata]); + dat[idata] = dd[idata]; + } + if (!quiet) fprintf(stderr," - accepted %d of %d measurements\n",j,length); + length = j; + if (jack > length) jack = length; + } + + if (stagger) { + double * dd,val; + + for (idata=irange1; idata<=irange2; idata++) { + dd = (double *)calloc(length/stagger+1,sizeof(double)); + for (i=0; i length) jack = length; + } + + if (rblock) { + double * dd,val; + + for (idata=irange1; idata<=irange2; idata++) { + dd = (double *)calloc(length-rblock+1,sizeof(double)); + for (val=i=0; i length) jack = length; + } + + idata = irange1; + + if (printjack && gram) { + printhgram(dat[0],gram,length,wd,weight,jack,hbin); + return(1); + } + + if (printjack || jack) { + double *jave,*jnorm; + int dl,id; + + dl = irange2-irange1+1; + jave = dblarr(dl*jack); + jnorm = dblarr(jack); + + for (j=0; j 1) ftrans(jave+j*dl,dl); + if (printjack) { + for (idata=irange1; idata<=irange2; idata++) { + id = idata-irange1; + if (!isarray) printf("%d %.16lg\n",id,jave[id+j*dl]); + else if (isarray == 1) + printf("%ld %ld %.16lg\n",id/h.d3,id%h.d3,jave[id+j*dl]); + else { + printf("%.16lg ",jave[id+j*dl]); + if ((id+1)%h.d3 == 0) printf("\n"); + } + + /* fprintf(stderr,"."); fflush(stderr); */ + } + } + } + if (!printjack) { + double aveg,error; + + for (idata=irange1; idata<=irange2; idata++) { + id = idata-irange1; + aveg = error = 0.0; + for (j=0; j p[i]) ? p[i] : minv; + } + + if (hbin <= 0) { + hg = (double *)calloc(bins,sizeof(double)); + d = (maxv-minv)/(bins-1); + bn = bins; + maxv += d/2; minv -= d/2; + xadd = 0.5; + } else { + maxv = hbin * ceil(maxv/hbin); + minv = hbin * floor(minv/hbin); + bins = maxv/hbin - minv/hbin + 1; + bn = bins-1; + hg = (double *)calloc(bins,sizeof(double)); + d = hbin; + xadd = 0; + } + + if (jack < 2) { + + if (!weight) + for (i=0; i a[i]) ? a[i] : mina; + } + } + d = (maxa-mina)/(bins-1); + maxa += d/2; mina -= d/2; + + if (maxb <= minb) { + maxb = -1e60; minb = -maxb; + for (i=0; i b[i]) ? b[i] : minb; + } + } + d = (maxb-minb)/(bins-1); + maxb += d/2; minb -= d/2; + + for (i=0; i ac[j]) ? max : ac[j]; + } + + for (j=0; j= min && level_val <= max) { + for (j=0; j=0; j--) printf("%s\n",level[j]); + + free(his); + + for (i=0; i <= NLEV; i++) { + sp = level[i]; + for (j=0; j< bins; j++) sp[j] = ' '; + sp[j] = 0; + } + + if (showlevel && level_val >= min && level_val <= max) { + for (j=0; j ac[i]) ? dmax : ac[i]; + k++; + } + dave /= k; + dsig = sqrt(dsig/k - sqr(dave)); + level[ (int)(NLEV*(dmin-min)/(max-min)) ][j] = '.'; + level[ (int)(NLEV*(dmax-min)/(max-min)) ][j] = '.'; + if (dave+dsig <= max) + level[ (int)(NLEV*(dave+dsig-min)/(max-min)) ][j] = '-'; + if (dave-dsig >= min) + level[ (int)(NLEV*(dave-dsig-min)/(max-min)) ][j] = '-'; + level[ (int)(NLEV*(dave-min)/(max-min)) ][j] = '*'; + } + + printf("\nblocked history with %d blocks, size %d ...\n",bins,nv/bins); + + for (j=NLEV; j>=0; j--) printf("%s\n",level[j]); + + return(1); +} + +/******************************************************************** + * Assume here periodicity so that d[2*nd - 1 - i] = d[i] + */ + +int +ccorr_matrix(double *dat[],int n,int length,int jack, + double *wd,int weight,double wnorm) +{ + int i,j,k; + double *a,*w,*eval,*evec; + + a = dblarr(n); + w = dblarr(n*n); + eval = dblarr(n); + evec = dblarr(n*n); + + if (!weight) wnorm = 1.0/length; else wnorm /= length; + + for (j=0; j 1 && (sig == 0.0 || sig < 1.0e-12*sqr(ave) || noauto)) { + /* if (!noauto) fprintf(stderr," ** small sigma/ave: %g/%g\n",sqrt(sig),ave); */ + *avep = ave; + *sigp = sqrt(sig/(nconf-1)); + *fsp = *fser = 0.5; + return(1); + } + + if (nconf <= 1) { + *avep = ave; + *sigp = 0.0; + *fsp = *fser = 0.5; + return(1); + } + + /* time correlations */ + + it = 0; + fs = 0.5; + do { + it++; + ax = ay = fi = 0.0; + + nc=nconf-it; + for (j=0; j it && it < nconf/2); + + if (it >= nconf/2) fprintf(stderr," ** correlation > N/2*%d\n",TINT); + + sig=sqrt(sig/(nconf-1)); + + *sigp = sig; + *avep = ave; + *fsp = fs; + *fser = fs*sqrt(2.*(2.*it+1)/nc); + + return(0); +} + + +/******************************************************************** + * this routine calculates autocorrelation function + */ + +int +autocorr(d,nd,res,nres) + int nd,nres; + double d[], res[]; +{ + double ave,fs,sig,ax,ay,fi; + int i,j,nc,it,k; + + ave = sig = 0.0; + + for(i=0; i= tmax) { + if (!ismax) num++; + ismax = 1; ismin = 0; + } else if (d[i] <= tmin) { + if (!ismin) num++; + ismax = 0; ismin = 1; + } + } + + return(1.0*nd/num); +} + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/aa/calclist.c b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/calclist.c new file mode 100644 index 0000000000000000000000000000000000000000..4e7a7779b1edf6ece4fb4ce3513663dcaf2b2f8c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/calclist.c @@ -0,0 +1,237 @@ +#include +#include +#include +#include +#include +/* #include */ +#include + +#include "stuff.h" +#ifdef cray +double asinh(double x) { return(log(sqrt(x*x+1.0) + x)); } +double acosh(double x) { return(log(sqrt(x*x-1.0) + x)); } +double atanh(double x) { return(0.5*log((1.0+x)/(1.0-x))); } +#endif + +#define pi 3.1415926535897929 + +char * getnumber(char *s,double *d); +double evallist(double d[],int dn, int prec); +void eval2(double *val1, double *val2, double d[],int dn); + + +static char *cmd, *cmd0; +/* This is modified by the calling program, to produce correct + * index + */ +int calclist_index = 0; + +/********************************************************** + * this evaluates the data arithmetic string + */ + +double +calclist(double d[],int dn, char *incmd) +{ + double val; + + /* save current number of calclist calls - return with #i */ + + cmd0 = cmd = incmd; + val = evallist(d,dn,0); + if (*cmd == 0) return(val); + if (*cmd == ')') { + fprintf(stderr,"Extra \')\' : %s\n",cmd0); + exit(0); + } + if (*cmd == ',') { + fprintf(stderr,"Extra \',\' : %s\n",cmd0); + exit(0); + } + fprintf(stderr,"Parser error: %s\n",cmd0); +} + + +double +evallist(double d[],int dn,int prec) +{ + int id; + char *sp; + double val,val2; + + /* get argument first */ + while (*cmd == ' ') cmd++; + if (*cmd == '(') { + cmd++; + val = evallist(d,dn,0); + if (*cmd != ')') { + fprintf(stderr,"Expecting \')\': \'%s\'\n",cmd0); + exit(0); + } + cmd++; + } else if (*cmd == '#') { + cmd++; + if (*cmd == 'i') { + /* print out index */ + val = calclist_index; + cmd++; + } else { + if ((cmd=getnumber(cmd,&val)) == NULL) { + fprintf(stderr,"Expecting 'i'/number after #: \'%s\'\n",cmd0); + exit(0); + } + id = val; + if (id > dn) { + fprintf(stderr,"#%d too large: max %d\n",id,dn); + exit(0); + } + val = d[id-1]; + } + } + else if ((sp=getnumber(cmd,&val)) != NULL) cmd = sp; + else { + /* now something else as ( or #; check for function name */ + if (strncmp(cmd,"sqrt(",5) == 0) { cmd+=5; val = sqrt(evallist(d,dn,0)); } + else if (strncmp(cmd,"abs(",4) == 0) { cmd+=4; val = fabs(evallist(d,dn,0)); } + else if (strncmp(cmd,"sin(",4) == 0) { cmd+=4; val = sin(evallist(d,dn,0)); } + else if (strncmp(cmd,"cos(",4) == 0) { cmd+=4; val = cos(evallist(d,dn,0)); } + else if (strncmp(cmd,"tan(",4) == 0) { cmd+=4; val = tan(evallist(d,dn,0)); } + else if (strncmp(cmd,"exp(",4) == 0) { cmd+=4; val = exp(evallist(d,dn,0)); } + else if (strncmp(cmd,"log(",4) == 0) { cmd+=4; val = log(evallist(d,dn,0)); } + else if (strncmp(cmd,"asin(",5) == 0) { cmd+=5; val = asin(evallist(d,dn,0)); } + else if (strncmp(cmd,"acos(",5) == 0) { cmd+=5; val = acos(evallist(d,dn,0)); } + else if (strncmp(cmd,"atan(",5) == 0) { cmd+=5; val = atan(evallist(d,dn,0)); } + else if (strncmp(cmd,"sinh(",5) == 0) { cmd+=5; val = sinh(evallist(d,dn,0)); } + else if (strncmp(cmd,"cosh(",5) == 0) { cmd+=5; val = cosh(evallist(d,dn,0)); } + else if (strncmp(cmd,"tanh(",5) == 0) { cmd+=5; val = tanh(evallist(d,dn,0)); } + else if (strncmp(cmd,"asinh(",6) == 0){ cmd+=6; val = asinh(evallist(d,dn,0)); } + else if (strncmp(cmd,"acosh(",6) == 0){ cmd+=6; val = acosh(evallist(d,dn,0)); } + else if (strncmp(cmd,"atanh(",6) == 0){ cmd+=6; val = atanh(evallist(d,dn,0)); } + else if (strncmp(cmd,"ceil(",5) == 0) { cmd+=5; val = ceil(evallist(d,dn,0)); } + else if (strncmp(cmd,"floor(",6) == 0){ cmd+=6; val = floor(evallist(d,dn,0)); } + else if (strncmp(cmd,"min(",4) == 0) { + cmd+=4; eval2(&val,&val2,d,dn); val = smaller(val,val2); } + else if (strncmp(cmd,"max(",4) == 0) { + cmd+=4; eval2(&val,&val2,d,dn); val = greater(val,val2); } + else if (strncmp(cmd,"pi",2) == 0) { + cmd+=2; val = pi; } + else { fprintf(stderr,"Unknown stuff: %s\n",cmd0); exit(0); } + + if (*cmd != ')') { + fprintf(stderr,"Expecting \')\' after the function name: \'%s\'\n",cmd0); + exit(0); + } + cmd++; + } + + /* now the operator */ + while (*cmd == ' ') cmd++; + + while (*cmd) { + switch (*cmd) { + case ',': return(val); + case ')': return(val); + case '^': + if (prec >= 5) return(val); + cmd++; + val = pow(val,evallist(d,dn,5)); + break; + case '*': + if (prec >= 4) return(val); + cmd++; + val *= evallist(d,dn,4); + break; + case '/': + if (prec >= 4) return(val); + cmd++; + val /= evallist(d,dn,4); + break; + case '%': + if (prec >= 4) return(val); + cmd++; + val = fmod(val,evallist(d,dn,4)); + break; + case '+': + if (prec >= 3) return(val); + cmd++; + val += evallist(d,dn,3); + break; + case '-': + if (prec >= 3) return(val); + cmd++; + val -= evallist(d,dn,3); + break; + + case '=': + if (cmd[1] != '=') { fprintf(stderr,"Expecting '==': %s\n",cmd0); exit(0); } + if (prec >= 2) return(val); + cmd += 2; + val = (val == evallist(d,dn,2)); + break; + case '<': + if (prec >= 2) return(val); + cmd++; + if (cmd[0] == '=') { + val = (val <= evallist(d,dn,2)); cmd++; + } else val = (val < evallist(d,dn,2)); + break; + case '>': + if (prec >= 2) return(val); + cmd++; + if (cmd[0] == '=') { + val = (val >= evallist(d,dn,2)); cmd++; + } else val = (val > evallist(d,dn,2)); + break; + case '!': + if (cmd[1] != '=') { fprintf(stderr,"Expecting '!=': %s\n",cmd0); exit(0); } + if (prec >= 2) return(val); + cmd += 2; + val = (val != evallist(d,dn,2)); + break; + + default: + fprintf(stderr,"Unknown operator: %s\n",cmd0); + exit(0); + } + } + return(val); +} + +/************************************************************** + * eval 2 arguments -- must be comma! + */ + +void +eval2(double *val1, double *val2, double d[],int dn) +{ + *val1 = evallist(d,dn,0); + if (*cmd != ',') { fprintf(stderr,"Expecting ',' : %s\n",cmd0); exit(0);} + cmd++; + *val2 = evallist(d,dn,0); +} + + +/************************************************************** + * hop number + */ +char * +getnumber(char *s,double *d) +{ + int dot = 0; + + if (sscanf(s,"%lg",d) != 1) return(NULL); + + while (*s == ' ' || *s == '\t') s++; + if (*s == '+' || *s == '-') s++; + if (*s == '.') { s++; dot = 1; } + while (*s <= '9' && *s >= '0') s++; + if ((!dot) && *s == '.') s++; + while (*s <= '9' && *s >= '0') s++; + if (*s == 'e' || *s == 'E') { + s++; + if (*s == '+' || *s == '-') s++; + while (*s <= '9' && *s >= '0') s++; + } + return(s); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/aa/dblarr.c b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/dblarr.c new file mode 100644 index 0000000000000000000000000000000000000000..c6281ea10193fd37e64e154da0c5233dbfc830ec --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/dblarr.c @@ -0,0 +1,48 @@ +#include +#include +#include +#include +#include +/* #include */ +#include + +/* #include */ + +#include "stuff.h" + +double * dblarr(int size) +{ + double * p; + + p = (double *)calloc(size,sizeof(double)); + if (p == NULL) { + fprintf(stderr," --- could not allocate double array of size %d\n",size); + exit(0); + } + return (p); +} + +float * fltarr(int size) +{ + float * p; + + p = (float *)calloc(size,sizeof(float)); + if (p == NULL) { + fprintf(stderr," --- could not allocate float array of size %d\n",size); + exit(0); + } + return (p); +} + + +int * intarr(int size) +{ + int * ip; + + ip = (int *)calloc(size,sizeof(int)); + if (ip == NULL) { + fprintf(stderr," --- could not allocate int array of size %d\n",size); + exit(0); + } + return(ip); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/aa/halt.c b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/halt.c new file mode 100644 index 0000000000000000000000000000000000000000..02a432d38189e5c06fa5cf5ac233c36f540e8c86 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/halt.c @@ -0,0 +1,16 @@ +#include +#include +#include +#include +#include +/* #include */ +#include + +int +halt(char *s,void *p) +{ + fprintf(stderr,s,p); + fprintf(stderr,"\n"); + exit(0); + return(0); /* just to get rid of an warning.. */ +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/aa/io_unformat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/io_unformat.c new file mode 100644 index 0000000000000000000000000000000000000000..b72c7cc17970ee3c16ded488d85648a3f0196221 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/io_unformat.c @@ -0,0 +1,292 @@ +/* + * UNFORMATTED IO SYSTEM + * Kari Rummukainen 1990 - 1999 + */ + + +#include +#include +#include +#include +#include +/* #include */ +#include + +#include "stuff.h" + + +static int inv_bytes,long_mode,block,dblock,lblock,fblock,cblock,iblock; +static double *dtmparr; +static float *ftmparr; +static long *ltmparr; +static int *itmparr; +static char *ctmparr; +static int msg = 0; + +#define l_h (sizeof(e_header)/sizeof(long)) + +typedef union { + e_header h; + long l[l_h]; +} h_union; + +typedef union { + i_header h; + int l[l_h]; +} i_union; + +#define ll_h (sizeof(ll_header)/sizeof(int)) + +typedef union { + ll_header h; + int l[ll_h]; +} ll_union; + + + +/************************************************** + * invert the byte ordering + */ + +long +swaplong(long a) +{ + union { + long l; + char c[sizeof(long)]; + } t1,t2; + int i; + + t1.l = a; + for (i=0; i= 2) { /* now long is longer than our long */ + fread(&ll.h,sizeof(ll_header),1,ff); + + for (k=0, j=( (long_mode == 2) ? 0 : 1 ); jlz < 1) h->lz = 1; + if (h->lt < 1) h->lt = 1; + + lblock = iblock = 0; + block = dblock = l.h.n_double; + if (dblock) dtmparr = dblarr(l.h.n_double); + block += fblock = l.h.n_float; + if (fblock) ftmparr = (float *)calloc(l.h.n_float,sizeof(float)); + block += cblock = l.h.n_char; + if (cblock) ctmparr = (char *)calloc(l.h.n_char,sizeof(char)); + + if (l.h.n_long) { + if (long_mode == 0) { + block += lblock = l.h.n_long; + ltmparr = (long *)calloc(l.h.n_long,sizeof(long)); + } else if (long_mode == 1) { + block += iblock = l.h.n_long; + itmparr = (int *)calloc(l.h.n_long,sizeof(int)); + } else if (long_mode >= 2) { + block += iblock = l.h.n_long; + itmparr = (int *)calloc(2*l.h.n_long,sizeof(int)); + } + } + + return (block); + +} + +int +skipheader(FILE *ff) +{ + e_header e; + i_header i; + + if (long_mode == 0) fread(&e,sizeof(e_header),1,ff); + else if (long_mode == 1) fread(&i,sizeof(i_header),1,ff); + else if (long_mode >= 2) { + fread(&i,sizeof(i_header),1,ff); + fread(&i,sizeof(i_header),1,ff); + } + return(1); +} + + +#define itmp_index(i) ((long_mode < 2) ? i : 2*i + long_mode-2) + +long +readdata(FILE *ff,double *arr) +{ + int ik,k,i; + long lk; + + if (dblock) fread(dtmparr,dblock,sizeof(double),ff); + if (fblock) fread(ftmparr,fblock,sizeof(float),ff); + if (lblock) fread(ltmparr,lblock,sizeof(long),ff); + if (iblock && long_mode < 2) fread(itmparr,iblock,sizeof(int),ff); + if (iblock && long_mode >= 2) fread(itmparr,2*iblock,sizeof(int),ff); + if (cblock) fread(ctmparr,cblock,sizeof(char),ff); + + k = 0; + if (long_mode == 1) { + fread(&ik,1,sizeof(int),ff); if (inv_bytes) lk = swapint(ik); else lk = ik; + } else if (long_mode == 0) { + fread(&lk,1,sizeof(long),ff); if (inv_bytes) lk = swaplong(lk); + } else { + int a[2]; + fread(&a,2,sizeof(int),ff); + if (inv_bytes) lk = swapint(a[long_mode-2]); + else lk = a[long_mode-2]; + } + + if (inv_bytes) { + for (i=0; i +#include +#include + +double fabs(double); + +#define verysmall(a,b) ((fabs(a)+b) - fabs(a) == 0) + +void eigsrt(int n,double d[],double *v); + +int jacobi(int n, double *ap,double d[],double *vp,int is_ordered) +{ + double *a,*b,*z,*v; + int ip,iq,i,j,nrot; + double sm,tresh,g,h,t,theta,tau,c,s; + + a = (double *)calloc(n*n,sizeof(double)); + v = (double *)calloc(n*n,sizeof(double)); + b = (double *)calloc(n,sizeof(double)); + z = (double *)calloc(n,sizeof(double)); + + for (i=0; i 4 && verysmall(d[ip],g) && verysmall(d[iq],g)) a[ip*n+iq] = 0; + else if (fabs(a[ip*n+iq]) > tresh) { + h = d[iq]-d[ip]; + if (verysmall(h,g)) t = a[ip*n+iq]/h; + else { + theta = 0.5*h/a[ip*n+iq]; + t = 1./(fabs(theta)+sqrt(1.0+theta*theta)); + if (theta < 0) t = -t; + } + c = 1./sqrt(1+t*t); + s = t*c; + tau = s/(1.0+c); + h = t*a[ip*n+iq]; + z[ip] -= h; z[iq] += h; + d[ip] -= h; d[iq] += h; + a[ip*n+iq] = 0; + for (j=0; j<=ip-1; j++) { + g = a[j*n+ip]; h = a[j*n+iq]; + a[j*n+ip] = g - s*(h + g*tau); + a[j*n+iq] = h + s*(g - h*tau); + } + for (j=ip+1; j<=iq-1; j++) { + g = a[ip*n+j]; h = a[j*n+iq]; + a[ip*n+j] = g - s*(h + g*tau); + a[j*n+iq] = h + s*(g - h*tau); + } + for (j=iq+1; j tresh) */ + } /* iq =ip+1..n-1 */ + } /* ip = 0..n-2 */ + + for (ip=0; ip= p) p = d[k = j]; + if (k != i) { + d[k] = d[i]; + d[i] = p; + for (j=0; j (y)) ? (x) : (y)) +#define smaller(x,y) (((x) < (y)) ? (x) : (y)) + +#define getoptnum(par1,par2,val){ \ + char *sp; \ + sp = ss; \ + if (!*ss && (argc-1)) ss = (argv+1)[0]; \ + if (sscanf(ss,par1,par2) != 1) { \ + *par2 = val; ss = sp; \ + } else { \ + if (!*sp && --argc) argv++; \ + ss = strchr(ss,0); \ + } \ +} + +#define getnum(par1,par2){ \ + if (!*ss && --argc) ss = (++argv)[0]; \ + if (sscanf(ss,par1,par2) != 1) { \ + fprintf(stderr,usage); \ + exit(-1); \ + } \ + ss = strchr(ss,0);} + +#define get2num(str,p1,p2){ \ + if (!(*ss) && --argc) ss = (++argv)[0]; \ + if (sscanf(ss,str,p1,p2) != 2) { \ + fprintf(stderr,usage); \ + exit(-1); \ + } \ + ss = strchr(ss,0);} + +#define get3num(str,p1,p2,p3){ \ + if (!(*ss) && --argc) ss = (++argv)[0]; \ + if (sscanf(ss,str,p1,p2,p3) != 3) { \ + fprintf(stderr,usage); \ + exit(-1); \ + } \ + ss = strchr(ss,0);} + +#define get4num(str,p1,p2,p3,p4){ \ + if (!(*ss) && --argc) ss = (++argv)[0]; \ + if (sscanf(ss,str,p1,p2,p3,p4) != 4) { \ + fprintf(stderr,usage); \ + exit(-1); \ + } \ + ss = strchr(ss,0);} + +#define get5num(str,p1,p2,p3,p4,p5){ \ + if (!(*ss) && --argc) ss = (++argv)[0]; \ + if (sscanf(ss,str,p1,p2,p3,p4,p5) != 5) { \ + fprintf(stderr,usage); \ + exit(-1); \ + } \ + ss = strchr(ss,0);} + + +#define getlist(v,i){ \ + if (!(*ss) && --argc) ss = (++argv)[0]; \ + i = 0; \ + while (*ss) { \ + v[i] = ss; \ + i++; \ + if (strchr(ss,';') == NULL) break; \ + ss = strchr(ss,';'); \ + *ss = 0; \ + ss++; \ + } \ + if (i <= 0) { \ + fprintf(stderr,usage); \ + exit(-1); \ + } \ + ss = strchr(ss,0);} + +#define getnumlist(v,i,format){ \ + if (!(*ss) && --argc) ss = (++argv)[0]; \ + i = 0; \ + while (*ss) { \ + if (sscanf(ss,format,&v[i++]) != 1) { \ + fprintf(stderr,usage); exit(-1); \ + } \ + if (strchr(ss,',') == NULL) break; \ + ss = strchr(ss,','); \ + ss++; \ + } \ + if (i <= 0) { \ + fprintf(stderr,usage); \ + exit(-1); \ + } \ + ss = strchr(ss,0);} + + +#define get1or3num(str1,str3,p1,p2,p3,n){ \ + if (!(*ss) && --argc) ss = (++argv)[0]; \ + if (sscanf(ss,str3,&p1,&p2,&p3) != 3) { \ + if (sscanf(ss,str1,&p1) != 1) { \ + fprintf(stderr,usage); \ + exit(-1); \ + } \ + n = 1; \ + p3 = p1; p2 = 1; \ + } else n = 3; \ + ss = strchr(ss,0);} + + + +/********* headers for unformatted io **********/ + +typedef struct { + int headerid,f1,headersize,f2; + int n_double,f3,n_long,f4,n_int,f5,n_char,f6; + int lx,f7,ly,f8,lz,f9,lt,f10; + int d1,f11,d2,f12,d3,f13,d4,f14,d5,f15,d6,f16,d7,f17,d8,f18; +} ll_header; + +typedef struct { + long headerid,headersize; + long n_double,n_long,n_float,n_char; + long lx,ly,lz,lt; + long d1,d2,d3,d4,d5,d6,d7,d8; +} e_header; + +typedef struct { + int headerid,headersize; + int n_double,n_long,n_float,n_char; + int lx,ly,lz,lt; + int d1,d2,d3,d4,d5,d6,d7,d8; +} i_header; + +#define E_HEADER_ID 91919191 + +/* and a couple of protos */ +int readheader(FILE *ff,e_header *h); +int skipheader(FILE *ff); +long readdata(FILE *ff,double *tmparr); + + +/* other prototypes */ + +double calclist(double d[],int dl,char *cmd); +double * dblarr(int size);float * fltarr(int size); +int * intarr(int size); +double confidence(double chisq,int dof); +int gaussj(double* a,int n, int np, double* b,int m); +int svdecomp(double *a, int n, double *b, int m); +void fitfun(double *x,double *y,double *sig,int ndata,double *a,int ma, + double *covar,int *lista,int mfit,double *chisq,int print, + double funcs()); +void covarfit(double *x,double *y,double *cmat,int ndata,double *a,int ma, + double *covar,int *lista,int mfit,double *chisq,int print, + double funcs()); +void jackfit(double *x,double *y,double * sig,int ndata,int n1, int n2, + int jack,double *a,int ma, + double *covar,int *lista,int mfit,double *chisq,int print, + int fullcov,int simplex,double funcs()); +void jack_fit(double *x,double *y,double * sig,int ndata,int n1, int n2, + int jack,double *a,int ma, + double *covar,int *lista,int mfit,double *chisq,int print, + int fullcov,int simplex,double funcs(),double *av); +double fitfun_s(double *x,double *y,double *sig,int ndata,double *a,int ma, + double funcs()); +double brent(double ax,double bx,double cx,double f(),double tol,double *xmin); +double polyfit(int ndata,double x[],double y[],double sig[], + int deg,double par[],double ep[]); +double nelder(int ndim,double p[],double ftol,double funk(),int *i); +void simplexfit(double *x,double *y,double *sig,int ndata,double *a,int ma, + int *lista,int mfit,double *chisq,int print,double funcs()); +int jacobi(int n, double *ap,double d[],double *v,int is_ordered); +double simpson(double x[], double y[], double res[], int r); + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/compile.sh.in b/qcd/part_cpu/applications/QCD/src/kernel_B/compile.sh.in new file mode 100644 index 0000000000000000000000000000000000000000..89084ebbd9b55e3c405dd4a3392801b129c4c465 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/compile.sh.in @@ -0,0 +1,34 @@ +#!/bin/bash -l +############################################################################# +### ### +### Compile script for SU3 ### +### ### +### Last modified: 2008-09-08 ### +### ### +############################################################################# + +export MAKE="#MAKE#" +export EXECNAME="#EXECNAME#" +export AAPROG="#AAPROG#" +export RM="#RM#" +export AR="#AR#" +export ARFLAGS="#ARFLAGS#" +export CC="#CC#" +export CFLAGS="#CFLAGS#" +export MPI_CC="#MPI_CC#" +export LDFLAGS="#LDFLAGS#" + +#MODULE_INIT# +#MODULE_CMD# #MODULE_FILES# + +cd libraries/ && $MAKE all \ +&& cd ../su3h_n/ && $MAKE su3_ahiggs \ +&& cp su3_ahiggs $EXECNAME \ +&& cd ../ + +#MODULE_INIT2# +#MODULE_CMD2# #MODULE_FILES2# + +cd aa/ && $MAKE \ +&& cp aa $AAPROG \ +&& cd ../ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_complex.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_complex.c new file mode 100644 index 0000000000000000000000000000000000000000..1b74f279e3f609443e2c925f2b62d75681ae4f7a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_complex.c @@ -0,0 +1,10 @@ +/* block_field_complex -- does blocking on u1 higgs scalar field + * Kari Rummukainen 2002 + */ + +#include LATDEF + +#define FIELD complex +#define block_FIELD block_field_complex + +#include "block_field_generic.c" diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_generic.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_generic.c new file mode 100644 index 0000000000000000000000000000000000000000..d590598d58eab86da0dec95d2aedab234d38da25 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_generic.c @@ -0,0 +1,42 @@ +/********************************************************* + * Generic field blocking routine for SU(2) + * Kari Rummukainen 1998-2002 + */ + +FIELD * block_FIELD(FIELD *f, int newlev[NDIM], int free_old) +{ + /* Block the field(s) + */ + int dir,i,j,off,x[NDIM],step[NDIM],oldlev[NDIM]; + node_struct oldnode; + FIELD *th; + + /* first, catch the current node */ + oldnode = node; + + /* calculate the relative change */ + foralldir(dir) step[dir] = newlev[dir] - + (oldlev[dir] = current_blocking_level[dir]); + + /* set new block */ + set_blocking_level( newlev ); + + th = new_latfield( FIELD ); + + /* and loop over */ + off = 0; + forallsites(i) { + foralldir(dir) x[dir] = (coordinate(i,dir) << step[dir]); + + /* index to the corresponding site */ + th[i] = f[ node_index( x, &oldnode ) ]; + + } + + set_blocking_level( oldlev ); + + if (free_old) free_latfield( f ); + + return( th ); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_su3adjoint.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_su3adjoint.c new file mode 100644 index 0000000000000000000000000000000000000000..c0069406652861e957fbdcf932dd647056018deb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_su3adjoint.c @@ -0,0 +1,10 @@ +/* block_su3_adjoint -- does blocking on su3 adjoint scalar field + * Kari Rummukainen 2002 + */ + +#include LATDEF + +#define FIELD adjoint_matrix +#define block_FIELD block_field_su3adjoint + +#include "block_field_generic.c" diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_lattice.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_lattice.c new file mode 100644 index 0000000000000000000000000000000000000000..5ff98245d3277db9eaa74eb7493ced40a0bd1793 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_lattice.c @@ -0,0 +1,199 @@ +/************************************************************************* + * Block lattice operations + */ + +#include "comdefs.h" +#include "generic.h" + +/* set_blocking_leve( int blev[ndim] ) sets the global structures + * for reducing the current lattice size by a factors given in blev, + * so that, for example, nx -> nx/2^blev[XUP]. + * Affects ONLY the gather, forallsites, etc. operations, does not + * touch individual variables. + * + * int * make_block_map( int blev[ndim] ) makes a index array from the + * current_blocking_level to blocking level blev. Thus, it returns + * an array of size current lattice size. It can go up or down, + * if up it fills the illegal sites with -(1<<30). + */ + +/* static variables for the blocking operations */ + +typedef struct block { + int level[NDIM]; /* blocking levels */ + struct block * next; /* ptr to next block level */ + lattice_struct lattice; /* saved lattice structure */ + node_struct node; /* saved node structure */ + node_struct * allnodes; /* ptr to node array for all nodes */ + site_struct * site; /* ptr to site struct */ + comlist_struct * comlist; /* and comlist for the gather */ + int *neighb[NDIRS]; /* neighbour arrays */ +} block_buf; + +static block_buf * b_buf = NULL; /* pointer to all of the saved blocking vars */ +static block_buf * base; /* will point to the base level */ +static int n_blocks = 0; /* number of blocking levels */ + +extern comlist_struct *comlist; /* ptr to comlist */ +extern node_struct *allnodes; + +void set_blocking_level( int blev[NDIM] ) +{ + int dir,i,found,d; + block_buf *b,*p; + + /* check first if it is the current level, nothing to do */ + found = 1; + foralldir(dir) + if (current_blocking_level[dir] != blev[dir]) found = 0; + if (found) return; + + /* Now something is happening */ + + if (b_buf == NULL) { + /* first time in, save the basic level here */ + base = b_buf = (block_buf *)memalloc(1, sizeof(block_buf) ); + foralldir(dir) base->level[dir] = 0; + for (dir=0; dirneighb[dir] = neighb[dir]; + + base->lattice = lattice; /* copy the lattice struct */ + base->node = node; /* copy the node struct */ + base->allnodes= allnodes; /* copy the address of the node array */ + base->site = site; /* and the address of the site array too */ + base->comlist = comlist; /* and comlist */ + base->next = NULL; + } + + /* ok, now chase the list and check if blev is defined */ + + found = 0; + for (b=b_buf; (!found) && b != NULL ; b=b->next) { + found = 1; + foralldir(dir) found = (found && (b->level[dir] == blev[dir])); + p = b; + } + + if (found) { + /* now the gather was found, copy the arrays */ + + lattice = p->lattice; + node = p->node; + allnodes = p->allnodes; + site = p->site; + comlist = p->comlist; + foralldir(dir) current_blocking_level[dir] = p->level[dir]; + for (dir=0; dirneighb[dir]; + + /* printf(" blocking level %d %d %d set up\n", + blev[XUP], blev[YUP], blev[ZUP]); + */ + fflush(stdout); + + } else { + + /* now it was NOT found -- make new level */ + + p->next = b = (block_buf *)memalloc(1, sizeof(block_buf) ); + + b->lattice.volume = 1; + foralldir(dir) { + current_blocking_level[dir] = b->level[dir] = blev[dir]; + + /* check if this is at all legal */ + if (base->lattice.size[dir] % (1 << blev[dir])) { + printf("Blocking error: cannot divide lattice by factors "); + foralldir(d) { + printf("%d",blev[d]); + if (d > 0) printf(" x "); + } + printf("\n"); + halt("Blocking error"); + } + + b->lattice.size[dir] = (base->lattice.size[dir]) / (1 << blev[dir]); + b->lattice.volume *= b->lattice.size[dir]; + + } + + lattice = b->lattice; + + /* make the structures -- THIS INITIALIZES THE ARRAYS */ + + /* printf(" blocking level %d %d %d set up, making arrays\n", + * blev[XUP], blev[YUP], blev[ZUP]); + */ + + make_lattice_arrays( &(b->lattice) ); + + /* and copy now the stuff to buffer */ + for (dir=0; dirneighb[dir] = neighb[dir]; + + b->node = node; /* copy the node struct */ + b->allnodes= allnodes; /* copy the address of the node array */ + b->site = site; /* and the address of the site array too */ + b->comlist = comlist; /* comlist */ + b->next = NULL; + + } +} + + +/****************************************************************/ + +void set_blocking_all( int lev ) +{ + int b[NDIM],dir; + + foralldir(dir) b[dir] = lev; + set_blocking_level( b ); +} + + +/************************************************************** + * Make mapping from current to blev + */ + +int *make_blocking_map( int newlev[NDIM] ) +{ + int dir,i,is_ok,x[NDIM],step[NDIM],oldlev[NDIM]; + node_struct newnode; + int *map; + + /* allocate map */ + map = (int *)memalloc( node.sites, sizeof(int) ); + + /* calculate the relative change */ + foralldir(dir) step[dir] = newlev[dir] - + (oldlev[dir] = current_blocking_level[dir]); + + /* set new blocking */ + set_blocking_level( newlev ); + /* catch the current node */ + newnode = node; + /* and just reset the level */ + set_blocking_level( oldlev ); + + /* and loop over */ + + forallsites(i) { + is_ok = 1; + foralldir(dir) { + if (step[dir] == 0) { + /* no change, coordinate as is */ + x[dir] = coordinate(i,dir); + } else if (step[dir] < 0) { + /* down, new is 2^n denser/larger than the old */ + x[dir] = (coordinate(i,dir) << (-step[dir])); + } else { + /* up, new is smaller than old */ + /* if (coordinate(i,dir) % (1<> step[dir]; + } + } + if (is_ok) map[i] = node_index( x, &newnode ); + else map[i] = -(1<<30); + + } + return( map ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_complex.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_complex.c new file mode 100644 index 0000000000000000000000000000000000000000..ee428e704af7b1a0157ce3d9c268ed4e0b9b2278 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_complex.c @@ -0,0 +1,9 @@ +/****** block_link_complex.c -- compute the blocked link ******************/ + +/* MIMD version 3 */ + +#include LATDEF + +#define block_link_MATRIX block_link_complex + +#include "block_link_generic.c" diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_generic.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_generic.c new file mode 100644 index 0000000000000000000000000000000000000000..4f10469f8e69545c99268c2d0396fd2820f54caa --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_generic.c @@ -0,0 +1,68 @@ +/********************************************************* + * Generic gauge field blocking routine + * Kari Rummukainen 1998 - 2002 + */ + +void block_link_MATRIX( MATRIX *oldl[NDIM], MATRIX *newl[NDIM], + int newlev[NDIM], int free_old ) +{ + /* Just multiply the straight links U(x) U(x+1) -> U(x) + */ + + int i,j,dir; + int x[NDIM],step[NDIM],oldlev[NDIM]; + int *nf[NDIM]; + MATRIX *tmpl[NDIM]; + msg_tag *tag[NDIM]; + node_struct oldnode; + + /* first, start XYZ-direction: move the link to ->staple */ + + foralldir(dir) { + oldlev[dir] = current_blocking_level[dir]; + step[dir] = newlev[dir] - oldlev[dir]; + if (step[dir] == 1) { + tag[dir] = start_get( oldl[dir], dir, EVENODD ); + } else if (0 != step[dir]) halt(" Gauge blocking error" ); + } + + foralldir(dir) { + /* wait the gathers, this clears the buffers */ + if (step[dir]) wait_get(tag[dir]); + /* grab the old neighbour arrays */ + nf[dir] = neighb[dir]; + } + + /* copy the node, needed */ + oldnode = node; + + /* block the system */ + set_blocking_level( newlev ); + + foralldir(dir) tmpl[dir] = new_latfield( MATRIX ); + + /* and loop over */ + forallsites(i) { + foralldir(dir) x[dir] = (coordinate(i,dir)) << step[dir]; + /* index to the corresponding site */ + j = node_index( x, &oldnode ); + + /* and mult */ + foralldir(dir) { + if (step[dir]) { + mult_MATRIX_nn( oldl[dir][j], oldl[dir][nf[dir][j]], tmpl[dir][i] ); + } else { + tmpl[dir][i] = oldl[dir][j]; + } + } + } + + /* restore old level */ + set_blocking_level( oldlev ); + + if (free_old) foralldir(dir) free_latfield( oldl[dir] ); + /* set the pointer last - this makes it possible to use same + link in and out, if the old is freed first */ + foralldir(dir) newl[dir] = tmpl[dir]; + +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_su2.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_su2.c new file mode 100644 index 0000000000000000000000000000000000000000..d44235e56760ea8f1930cba7a62b314eba414f19 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_su2.c @@ -0,0 +1,9 @@ +/****** block_link_su2.c -- compute the blocked link ******************/ + +/* MIMD version 3 */ + +#include LATDEF +#include "generic_su2.h" + +#define block_link_MATRIX block_link_su2 +#include "block_link_generic.c" diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_su3.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_su3.c new file mode 100644 index 0000000000000000000000000000000000000000..da30c2dcd34cbd04427f4b0ae17a6b89058783ea --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_su3.c @@ -0,0 +1,8 @@ +/****** block_link_su3.c -- compute the blocked link ******************/ + +/* MIMD version 3 */ + +#include LATDEF + +#define block_link_MATRIX block_link_su3 +#include "block_link_generic.c" diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/bulk_update_mpi.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/bulk_update_mpi.c new file mode 100644 index 0000000000000000000000000000000000000000..6dbf17272d5ec4d72b5ea599ffa21c1ccbdbd1d3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/bulk_update_mpi.c @@ -0,0 +1,21 @@ +/****************************************************************** + * here some MPI typical "bulk update" subroutines + */ + +#include "comdefs.h" +#include "generic.h" + +/* Is the link inside node? Thus, OK if the link is not _along_ any + * of the bottom slabs of the node. For example, reject + * x-links where y,z,.. coordinate == min on the node + */ + +int inside_node(int i, int dir) +{ + register int d,s; + + s = 1; + foralldir( d ) if ( d != dir && coordinate(i,d) == node.xmin[d] ) s = 0; + return( s ); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_mpi.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_mpi.c new file mode 100644 index 0000000000000000000000000000000000000000..166748a2e2b5c6ebad48edcb49d3c2b5d7916d68 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_mpi.c @@ -0,0 +1,937 @@ +/****************** com_mpi.c ***************************************** + * + * Communications routines, for MPI interface + * Modified from the MILC lattice QCD one + * KR 2001 + * + + + + g_sync() provides a synchronization point for all nodes. + g_floatsum() sums a floating point number over all nodes. + g_doublesum() sums a double over all nodes. + g_vecdoublesum() sums a vector of doubles over all nodes. + g_floatmax() finds maximum of a floating point number over all nodes. + g_doublemax() finds maximum of a double over all nodes. + broadcast_float() broadcasts a single precision number from + node 0 to all nodes. + broadcast_double() broadcasts a double precision number + send_integer() sends an integer to one other node + receive_integer() receives an integer + terminate() kills the job on all processors + + start_gather() starts asynchronous sends and receives required + to gather neighbors. + wait_gather() waits for receives to finish, insuring that the + data has actually arrived. + + start_scatter() invese of gather + wait_scatter() + + send_field() sends a field to one other node. + receive_field() receives a field from some other node. +*/ + +/* load in definitions, variables etc. */ + +#include "comdefs.h" +#include "generic.h" +#include "timers.h" /* includes comm timer calculators */ + +#ifdef TIMERS +static timer_type total_gather_timer, start_gather_timer, wait_send_timer, + wait_receive_timer, g_sync_timer, + g_sum_timer, broadcast_timer, send_timer, total_time; +#endif +static double total_sent_data = 0.0, total_gather_data = 0.0; + + +#define MIN_GATHER_INDEX 100 +#define MAX_GATHER_INDEX 7100 /* allows 7000 concurrent gathers */ +#define FIELD_TYPE 11 /* used in send/receive field */ +#define SEND_INTEGER_TYPE 12 /* used in send/receive int */ + +#define SEND_FLAG 1 /* flags for marking msg_tags */ +#define RECEIVE_FLAG 2 + +extern comlist_struct *comlist; /* the comlist variables in layout.c */ + +/************************************************************************/ + +/* get all msg_tags in a single array, avoid allocating + * small bits and pieces of messages + */ + +#define N_MSG_TAG 50 + +static msg_tag msg_tag_arr[N_MSG_TAG]; +static msg_tag *msg_tag_free; + +void init_msg_tags() +{ + int i; + + for (i=0; inext; + } + + /* p points to the last in the list */ + msg_tag_free = p->next; + p->next = NULL; + return(r); +} + + +void release_msg_tags(msg_tag *tp) +{ + msg_tag *p; + + for (p=tp; p->next != NULL; p=p->next) ; + p->next = msg_tag_free; + msg_tag_free = tp; +} + +/************************************************************************/ + + +/* Machine initialization */ +#include +void initialize_machine() { +/* MPI_Init(&argc,&argv); */ + +#ifdef TIMERS + timer_reset( &total_time ); + timer_start( &total_time ); + + timer_reset( &total_gather_timer ); + timer_reset( &start_gather_timer ); + timer_reset( &wait_send_timer ); + timer_reset( &wait_receive_timer ); + timer_reset( &g_sync_timer ); + timer_reset( &g_sum_timer ); + timer_reset( &broadcast_timer ); + timer_reset( &send_timer ); +#endif + + init_msg_tags(); + +} + + +/************************************************************************/ + +/* this formats the wait_array, used by forallsites_waitA() + * should be made as fast as possible! + * + * wait_array[i] contains a bit at position 1<= node.sites ) wait_arr_[i] = wait_arr_[i] | (1<= node.sites ) wait_arr_[i] = wait_arr_[i] | (1< 4 + halt("forallsites_waitA requires NDIM <= 4!\n"); +#endif + + if (ntag > NA_MAX) halt("Error in forallsites_waitA: too many gathers"); + mask = 0; + for (i=0; idir)); + } + return( mask ); +} + + +#ifdef OLD_WAIT_ARR +int setup_wait_arr( unsigned char *wait_arr, msg_tag* tag_out[], + msg_tag* tag_in[], int ntag ) +{ + register int i,j; + int nt,dir[NA_MAX]; + + if (ntag > NA_MAX) halt("Error in forallsites_waitA: too many gathers"); + for (nt=i=0; idir; + nt++; + } + tag_out[nt] = (msg_tag *)NULL; /* needed for the last loop */ + + forallsites(i) { + wait_arr[i] = 0; /* basic, no wait */ + for (j=0; j= node.sites ) wait_arr[i] = j+1; + } + return( nt ); +} +#endif + + + + +/************************************************************************/ +/* GATHER ROUTINES */ +/* start_gather() returns a pointer to a list of msg_tag's, which + be used as input to subsequent wait_gather(). + + This list contains msg_tags for all receive buffers, followed by + end flag. + + If no messages at all are required, the routine will return NULL. + msg_buf==NULL should be a reliable indicator of no message. + + usage: tag = start_gather( source, size, direction, parity ) +*/ + + + +msg_tag* start_gather( field, size, dir, parity ) + /* arguments */ + char * field; /* pointer to some latfield */ + int size; /* size in bytes of the field (eg sizeof(su3_vector))*/ + int dir; /* direction to gather from. eg XUP - index into + neighbor tables */ + int parity; /* parity of sites whose neighbors we gather. + one of EVEN, ODD or EVENODD (EVEN+ODD). */ +{ + /* local variables */ + int i,j,k; /* scratch */ + int offset; /* number of sites in this receive or send */ + int *idx; /* index array pointer */ + int nsites; + char *tpt; /* temp ptr to buffer */ + msg_tag *mbuf, *mp; /* list of message tags, to be returned */ + comlist_struct *cp; + send_struct *sp; + receive_struct *rp; + static int index = MIN_GATHER_INDEX; /* index to identify the operation */ + + + if (dir < 0 || dir >= NDIRS) { + printf("No such gather %d, node %d\n",dir,mynode()); + terminate(1212); + } + + /* First, get the rolling index for the operation + * This MUST BE HERE even if there's nothing to do, because + * somebody else might be doing this! + */ + ++index ; if (index > MAX_GATHER_INDEX) index = MIN_GATHER_INDEX; + + cp = &comlist[dir]; + + /* Now if there's nothing to do, return - CHECK IF GATHERED */ + if( ( cp->n_send == 0 && cp->n_receive == 0 ) + || is_already_gathered( field, size, dir, parity) ) + return( (msg_tag *) NULL ); + + /* mark gathered, if needed */ + gather_mark_gathered( field, size, dir, parity ); + + /* allocate a buffer for the msg_tags. This is dynamically allocated + because there may be an arbitrary number of gathers in progress + in any direction. SIZE = n_msgs sent+received, for end flag + */ + + mbuf = get_msg_tags(cp->n_send + cp->n_receive); + +#ifdef TIMERS + /* mark start time for this gather */ + mbuf->start_time = timer_start( &start_gather_timer ); +#endif + + mp=mbuf; + /* HANDLE RECEIVES: loop over nodes which will send here */ + for (i=0, rp=cp->from_node; in_receive;i++, rp=rp->next, mp=mp->next) { + /* note--neighbors of EVEN sites are always first in the list! + * Thus, for ODD sites we must change the offset + */ + switch (parity) { + case EVEN: nsites = rp->n_even; offset = rp->offset; break; + case ODD: nsites = rp->n_odd; offset = rp->offset + rp->n_even; break; + case EVENODD: nsites = rp->n; offset = rp->offset; break; + } + + mp->flag = RECEIVE_FLAG; /* flag as normal receive */ + mp->dir = dir; + /* and post receive -- comes right on spot */ + MPI_Irecv( ((char *)field) + offset*size, nsites*size, MPI_BYTE, + rp->node, index, MPI_COMM_WORLD, &(mp->mpi) ); + + total_gather_data += nsites*size; + + } + + /* HANDLE SENDS - note: mp automatically correct */ + for(k=0,sp=cp->to_node; k < cp->n_send; k++,sp = sp->next, mp = mp->next) { + switch (parity) { + case EVEN: nsites = sp->n_even; offset = 0; break; + case ODD: nsites = sp->n_odd; offset = sp->n_even; break; + case EVENODD: nsites = sp->n; offset = 0; break; + } + + /* allocate buffer */ + tpt = (char *)malloc( nsites*size ); + if(tpt==NULL){printf("NO ROOM for tpt, node %d\n",mynode());exit(1);} + mp->flag = SEND_FLAG; /* flag as send */ + mp->dir = dir; + mp->buf = tpt; + /* gather data into the buffer */ + + idx = sp->sitelist + offset; /* initial offset */ + for (j=0; jbuf, nsites*size, MPI_BYTE, + sp->node, index, MPI_COMM_WORLD, &(mp->mpi) ); + + total_gather_data += nsites*size; + + } + + timer_end( &start_gather_timer ); + + /* return */ + return(mbuf); +} + + +msg_tag * wait_gather( msg_tag *mbuf ) +{ + MPI_Status status; + msg_tag *mp; + + if (mbuf == NULL) return((msg_tag *)NULL); + timer_start( &wait_receive_timer ); + /* wait for all receive messages */ + for(mp=mbuf; mp != NULL && mp->flag == RECEIVE_FLAG; mp=mp->next) { + MPI_Wait( &(mp->mpi), &status ); + } + timer_end( &wait_receive_timer ); + /* wait for all send messages */ + timer_start( &wait_send_timer ); + for( ; mp != NULL && mp->flag == SEND_FLAG; mp=mp->next) { + MPI_Wait( &(mp->mpi), &status ); + /* release the buffer */ + free( mp->buf ); + } +#ifdef TIMERS + total_gather_timer.total += + timer_end( &wait_send_timer ) - mbuf->start_time; + total_gather_timer.count ++; +#endif + + /* and free the mbuf */ + release_msg_tags( mbuf ); + return((msg_tag *)NULL); +} + + +void wait_gather_arr( msg_tag* tag[], int ntag ) +{ + int i; + for (i=0; i field[i_otherparity], on + * neighb. nodes. + * THIS MODIFIES THE LATTICE FIELD field ON OTHERPARITY. + * Thus, there is little sense using this on EVENODD (but it is possible) + */ + +msg_tag * start_scatter( field, size, dir, parity ) + char * field; /* pointer to some latfield */ + int size; /* size in bytes of the field (eg sizeof(su3_vector))*/ + int dir; /* direction to push the data, eg XUP - index into + neighbor tables */ + int parity; /* parity of sites from where we push + one of EVEN, ODD or EVENODD (EVEN+ODD). */ +{ + /* local variables */ + int i,k; /* scratch */ + int offset; /* number of sites in this receive or send */ + int nsites; + char *tpt; /* temp ptr to buffer */ + msg_tag *mbuf, *mp; /* list of message tags, to be returned */ + comlist_struct *cp; + send_struct *sp; + receive_struct *rp; + static int index = MIN_GATHER_INDEX; /* index to identify the operation */ + + + if (dir < 0 || dir >= NDIRS) { + printf("No such gather %d, node %d\n",dir,mynode()); + terminate(1212); + } + + /* First, get the rolling index for the operation + * This MUST BE HERE even if there's nothing to do, because + * somebody else might be doing this! + */ + ++index ; if (index > MAX_GATHER_INDEX) index = MIN_GATHER_INDEX; + + cp = &comlist[dir]; + + /* Now if there's nothing to do, return */ + if( cp->n_send == 0 && cp->n_receive == 0 ) + return( (msg_tag *) NULL ); + + /* allocate a buffer for the msg_tags */ + + mbuf = get_msg_tags(cp->n_send + cp->n_receive); + +#ifdef TIMERS + /* mark start time for this gather */ + mbuf->start_time = timer_start( &start_gather_timer ); +#endif + + mp=mbuf; + /* HANDLE RECEIVES: loop over nodes which will send here + * note difference to start_gather; now using to_node, n_send! + */ + for(k=0,sp=cp->to_node; k < cp->n_send; k++,sp = sp->next, mp=mp->next) { + switch (parity) { + case EVEN: nsites = sp->n_even; offset = 0; break; + case ODD: nsites = sp->n_odd; offset = sp->n_even; break; + case EVENODD: nsites = sp->n; offset = 0; break; + } + + /* allocate buffer */ + tpt = (char *)malloc( nsites*size ); + if(tpt==NULL){printf("NO ROOM for tpt, node %d\n",mynode());exit(1);} + mp->flag = RECEIVE_FLAG; /* flag as receive */ + mp->dir = opp_dir(dir); /* using opp_dir here, works with _wait etc. */ + mp->buf = tpt; + + mp->nsites = nsites; + mp->size = size; + mp->field = field; + mp->sitelist = sp->sitelist + offset; /* list of sites to scatter */ + + /* post receive */ + MPI_Irecv( mp->buf, nsites*size, MPI_BYTE, + sp->node, index, MPI_COMM_WORLD, &(mp->mpi) ); + + total_gather_data += nsites*size; + } + + /* and HANDLE SENDS - note: mp automatically correct */ + for (i=0, rp=cp->from_node; in_receive;i++, rp=rp->next, mp=mp->next) { + /* note--EVEN sites are always first in the list! + * Thus, for ODD sites we must change the offset + */ + switch (parity) { + case EVEN: nsites = rp->n_even; offset = rp->offset; break; + case ODD: nsites = rp->n_odd; offset = rp->offset + rp->n_even; break; + case EVENODD: nsites = rp->n; offset = rp->offset; break; + } + + mp->flag = SEND_FLAG; /* flag as normal receive */ + mp->dir = opp_dir(dir); + /* and post send -- comes right from spot */ + MPI_Issend( ((char *)field) + offset*size, nsites*size, MPI_BYTE, + rp->node, index, MPI_COMM_WORLD, &(mp->mpi) ); + + total_gather_data += nsites*size; + } + timer_end( &start_gather_timer ); + + /* return */ + return(mbuf); +} + + +msg_tag * wait_scatter( msg_tag *mbuf ) +{ + MPI_Status status; + msg_tag *mp; + int *idx,j; + char *tpt; + + if (mbuf == NULL) return((msg_tag *)NULL); + timer_start( &wait_receive_timer ); + /* wait for all receive messages */ + for(mp=mbuf; mp != NULL && mp->flag == RECEIVE_FLAG; mp=mp->next) { + MPI_Wait( &(mp->mpi), &status ); + + /* now copy data to right spot */ + idx = mp->sitelist; /* index list */ + tpt = mp->buf; + for (j=0; jnsites; j++, tpt += mp->size) { + memcpy( mp->field + idx[j]*mp->size, tpt, mp->size ); + } + /* and free the field */ + free( mp->buf ); + } + timer_end( &wait_receive_timer ); + /* wait for all send messages */ + timer_start( &wait_send_timer ); + for( ; mp != NULL && mp->flag == SEND_FLAG; mp=mp->next) { + MPI_Wait( &(mp->mpi), &status ); + } +#ifdef TIMERS + total_gather_timer.total += + timer_end( &wait_send_timer ) - mbuf->start_time; + total_gather_timer.count ++; +#endif + /* and free the mbuf */ + release_msg_tags( mbuf ); + return((msg_tag *)NULL); +} + + +/**************************************************************** + */ + + +/* SEND AND RECEIVE FIELD */ +/* send_field is to be called only by the node doing the sending */ +/* get_field is to be called only by the node to which the field was sent */ +void send_field(buf,size,tonode) + void *buf; int size,tonode; +{ + timer_start( &send_timer ); + MPI_Send(buf,size,MPI_BYTE,tonode,FIELD_TYPE,MPI_COMM_WORLD); + timer_end( &send_timer ); + total_sent_data += size; +} +void receive_field(buf,size) + void *buf; int size; +{ + MPI_Status status; + + timer_start( &send_timer ); + MPI_Recv(buf,size,MPI_BYTE,MPI_ANY_SOURCE,FIELD_TYPE, + MPI_COMM_WORLD,&status); + timer_end( &send_timer ); + total_sent_data += size; +} + +/* BASIC COMMUNICATIONS FUNCTIONS */ + +/* Tell what kind of machine we are on */ +static char name[]="MPI (portable)"; +char * machine_type(){ + return(name); +} + +/* Return my node number */ +int mynode() +{ + int node; + MPI_Comm_rank( MPI_COMM_WORLD, &node ); + return(node); +} + +/* Return number of nodes */ +int numnodes() +{ + int nodes; + MPI_Comm_size( MPI_COMM_WORLD, &nodes ); + return(nodes); +} + +/* Synchronize all nodes */ +void g_sync() +{ + timer_start( &g_sync_timer ); + MPI_Barrier( MPI_COMM_WORLD ); + timer_end( &g_sync_timer ); +} + +/* Sum float over all nodes. dist=1: distribute to all nodes */ +void g_floatsum( float * fpt, int dist ) +{ + float work; + + timer_start( &g_sum_timer ); + if (dist) { + MPI_Allreduce( fpt, &work, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD ); + *fpt = work; + } else { + MPI_Reduce ( fpt, &work, 1, MPI_FLOAT, MPI_SUM, 0 , MPI_COMM_WORLD ); + if (this_node == 0) *fpt = work; + } + timer_end( &g_sum_timer ); +} + +/* Sum double over all nodes, and scatter the result */ +void g_doublesum( double * dpt, int dist ) +{ + double work; + + timer_start( &g_sum_timer ); + if (dist) { + MPI_Allreduce( dpt, &work, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); + *dpt = work; + } else { + MPI_Reduce ( dpt, &work, 1, MPI_DOUBLE, MPI_SUM, 0 , MPI_COMM_WORLD ); + if (this_node == 0) *dpt = work; + } + timer_end( &g_sum_timer ); +} + +#define N_ELEM 100 + +/* Sum a vector of ints over all nodes */ +void g_vecintsum( int *dpt, int n, int dist) +{ + int *work, arr[N_ELEM]; + register int i; + + timer_start( &g_sum_timer ); + if (n <= N_ELEM) work = arr; else work = (int *)malloc(n*sizeof(int)); + if (dist) { + MPI_Allreduce( dpt, work, n, MPI_INT, MPI_SUM, MPI_COMM_WORLD ); + for (i=0; i N_ELEM) free(work); + timer_end( &g_sum_timer ); +} + + +/* Sum a vector of floats over all nodes */ +void g_vecfloatsum( float *dpt, int nfloats, int dist ) +{ + float *work, arr[N_ELEM]; + register int i; + + timer_start( &g_sum_timer ); + + if (nfloats <= N_ELEM) work = arr; + else work = (float *)malloc(nfloats*sizeof(float)); + + if (dist) { + MPI_Allreduce( dpt, work, nfloats, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD ); + for (i=0; i N_ELEM) free(work); + timer_end( &g_sum_timer ); + + /** + work = (float *)malloc(nfloats*sizeof(float)); + MPI_Allreduce( dpt, work, nfloats, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD ); + for(i=0;i N_ELEM) free(work); + timer_end( &g_sum_timer ); + + /** + work = (double *)malloc(ndoubles*sizeof(double)); + MPI_Allreduce( dpt, work, ndoubles, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); + for(i=0;istatus == GATHER_FOLLOW && (a->gathered[dir] ^ parity) == 0) { +#ifdef CHECK_GATHER_FIELDS + /* now looks like is gathered, check if the checkup field has changed */ + if (((parity & EVEN) && (*((unsigned int *)field) != a->check_even[dir] )) || + ((parity & ODD ) && (*((unsigned int *)(field+size*(node.sites-1))) + != a->check_odd[dir] ))) { + printf(" #### GATHER CHECK: Forgotten mark_changed() somewhere!\n"); + if (size < sizeof(int)) + printf(" Because field size %d < sizeof(int), can be spurious\n",size); + else { + printf(" Field size %d chars, dir %d parity %d\n",size, dir, parity); + } + halt(" ###### "); + } +#endif + n_gather_avoided++; + return(1); + } + n_gather_done++; + return(0); +} + +void gather_status_reset( char *field, int size ) +{ + struct gather_status_arr *a; + /* cast the latfield array */ + a = (struct gather_status_arr *) (field + size*node.latfield_size); + a->status = GATHER_NOT_FOLLOW; +} + +void gather_mark_dirty( char *field, int size, int parity ) +{ + int dir,p; + struct gather_status_arr *a; + + /* cast the latfield array */ + a = (struct gather_status_arr *) (field + size*node.latfield_size); + a->status = GATHER_FOLLOW; + p = opp_parity(parity); + /* mark opposite parity, because will fetch from there! */ + /* Remember that need to mark opposite directions too! */ + for(dir=0; dirgathered[dir] &= (!p); +} + +void gather_mark_gathered( char *field, int size, int dir, int parity ) +{ + struct gather_status_arr *a; + + /* cast the latfield array */ + a = (struct gather_status_arr *) (field + size*node.latfield_size); + if (a->status == GATHER_FOLLOW) a->gathered[dir] |= parity; +#ifdef CHECK_GATHER_FIELDS + if (parity & EVEN) a->check_even[dir] = *((unsigned int *)field); + if (parity & ODD ) a->check_odd[dir] = + *((unsigned int *)(field+size*(node.sites-1))); +#endif +} + + +/****************************************************************/ + + + +/* version of exit for multinode processes -- kill all nodes */ +void terminate(int status) +{ + printf("Termination: node %d, status = %d\n",this_node,status); + fflush(stdout); + MPI_Abort( MPI_COMM_WORLD, 0); + exit(status); +} + + + + +/* clean exit from all nodes */ +void finishrun() +{ +#ifdef TIMERS + report_comm_timers(); +#endif + + if (this_node == 0) { + extern int n_gather_done,n_gather_avoided; + + printf(" COMMS from node 0: %d done, %d (%.2g%%) optimized away\n", + n_gather_done, n_gather_avoided, + 100.0*n_gather_avoided/(n_gather_avoided+n_gather_done)); + } + + fflush(stdout); + fflush(NULL); /* for all open files */ + MPI_Finalize(); + exit(0); +} + + +void report_comm_timers() +{ +#ifdef TIMERS + double tot; + if (this_node == 0) { + printf(" *************************\n"); + printf(" MPI communications timers from node 0:\n"); + + printf(" start_get: "); + timer_report( &start_gather_timer ); + printf(" waiting send: "); + timer_report( &wait_send_timer ); + printf(" receive: "); + timer_report( &wait_receive_timer ); + printf(" g_sync: "); + timer_report( &g_sync_timer ); + printf(" g_sum: "); + timer_report( &g_sum_timer ); + printf(" broadcast: "); + timer_report( &broadcast_timer ); + printf(" send/receive: "); + timer_report( &send_timer ); + printf(" Total time from gather start -> end (does not count against comm.time)\n"); + printf(" "); + timer_report( &total_gather_timer ); + + printf(" Moved data:\n"); + printf(" * send/receive %g MB, bandwith %g MB/sec\n", + total_sent_data*1e-6, total_sent_data*1e-6/send_timer.total); + + printf(" * total pushed/pulled data %g MB\n",total_gather_data*1e-6); + printf(" * with optimistic bandwidth %g MB/sec, pessimistic %g MB/s\n", + total_gather_data*1e-6/ + (start_gather_timer.total + wait_send_timer.total + + wait_receive_timer.total), + total_gather_data*1e-6/total_gather_timer.total); + + tot = start_gather_timer.total + wait_send_timer.total + wait_receive_timer.total + + g_sync_timer.total + + g_sum_timer.total + broadcast_timer.total + send_timer.total ; + + /* find current time */ + timer_end( &total_time ); + + printf(" Total comm. time %.3g, total time %.3g, comm %.2g%%\n", + tot, total_time.total, 100*tot/total_time.total ); + printf(" ***** \n"); + } +#endif +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_mpi_2.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_mpi_2.c new file mode 100644 index 0000000000000000000000000000000000000000..343d49c9570e2a189cd8be8918a3b59a512c04fa --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_mpi_2.c @@ -0,0 +1,728 @@ +/****************** com_mpi.c ***************************************** + * + * Communications routines, for MPI interface + * Modified from the MILC lattice QCD one + * KR 2001 + * + + + + g_sync() provides a synchronization point for all nodes. + g_floatsum() sums a floating point number over all nodes. + g_doublesum() sums a double over all nodes. + g_vecdoublesum() sums a vector of doubles over all nodes. + g_floatmax() finds maximum of a floating point number over all nodes. + g_doublemax() finds maximum of a double over all nodes. + broadcast_float() broadcasts a single precision number from + node 0 to all nodes. + broadcast_double() broadcasts a double precision number + send_integer() sends an integer to one other node + receive_integer() receives an integer + terminate() kills the job on all processors + + start_gather() starts asynchronous sends and receives required + to gather neighbors. + wait_gather() waits for receives to finish, insuring that the + data has actually arrived. + + send_field() sends a field to one other node. + receive_field() receives a field from some other node. +*/ + +/* load in definitions, variables etc. */ + +#include "comdefs.h" +#include "generic.h" +#include "timers.h" /* includes comm timer calculators */ + +#ifdef TIMERS +static timer_type start_gather_timer, wait_send_timer, wait_receive_timer, g_sync_timer, + g_sum_timer, broadcast_timer, send_timer, total_time; +#endif +static double mpi_sent_data = 0.0; + + +#define MIN_GATHER_INDEX 100 +#define MAX_GATHER_INDEX 7100 /* allows 7000 concurrent gathers */ +#define FIELD_TYPE 11 /* used in send/receive field */ +#define SEND_INTEGER_TYPE 12 /* used in send/receive int */ + +#define SEND_FLAG 1 /* flags for marking msg_tags */ +#define RECEIVE_FLAG 2 + +extern comlist_struct *comlist; /* the comlist variables in layout.c */ + +/************************************************************************/ + +/* get all msg_tags in a single array, avoid allocating + * small bits and pieces of messages + */ + +#define N_MSG_TAG 40 + +static msg_tag msg_tag_arr[N_MSG_TAG]; +static msg_tag *msg_tag_free, *msg_tag_last; + +void init_msg_tags() +{ + int i; + + for (i=0; inext; + } + + /* p points to the last in the list */ + msg_tag_free = p->next; + p->next = NULL; + return(r); +} + + +void release_msg_tags(msg_tag *tp) +{ + msg_tag *p; + + for (p=tp; p->next != NULL; p=p->next) ; + p->next = msg_tag_free; + msg_tag_free = tp; +} + +/************************************************************************/ + + +/* Machine initialization */ +#include +void initialize_machine() { +/* MPI_Init(&argc,&argv); */ + +#ifdef TIMERS + timer_reset( &total_time ); + timer_start( &total_time ); + + timer_reset( &start_gather_timer ); + timer_reset( &wait_send_timer ); + timer_reset( &wait_receive_timer ); + timer_reset( &g_sync_timer ); + timer_reset( &g_sum_timer ); + timer_reset( &broadcast_timer ); + timer_reset( &send_timer ); +#endif + + init_msg_tags(); + +} + + +/************************************************************************/ + +/* this formats the wait_array, used by forallsites_waitA() + * should be made as fast as possible! + */ + +int setup_wait_arr( unsigned char *wait_arr, msg_tag* tag_out[], + msg_tag* tag_in[], int ntag ) +{ + register int i,j; + int nt,dir[NA_MAX]; + + if (ntag > NA_MAX) halt("Error in forallsites_waitA: too many gathers"); + for (nt=i=0; idir; + nt++; + } + tag_out[nt] = (msg_tag *)NULL; /* needed for the last loop */ + + forallsites(i) { + wait_arr[i] = 0; /* basic, no wait */ + for (j=0; j= node.sites ) wait_arr[i] = j+1; + } + return( nt ); +} + + +/************************************************************************/ +/* GATHER ROUTINES */ +/* start_gather() returns a pointer to a list of msg_tag's, which + be used as input to subsequent wait_gather(). + + This list contains msg_tags for all receive buffers, followed by + end flag. + + If no messages at all are required, the routine will return NULL. + msg_buf==NULL should be a reliable indicator of no message. + + usage: tag = start_gather( source, size, direction, parity ) +*/ + + + +msg_tag* start_gather( field, size, dir, parity ) + /* arguments */ + char * field; /* pointer to some latfield */ + int size; /* size in bytes of the field (eg sizeof(su3_vector))*/ + int dir; /* direction to gather from. eg XUP - index into + neighbor tables */ + int parity; /* parity of sites whose neighbors we gather. + one of EVEN, ODD or EVENODD (EVEN+ODD). */ +{ + /* local variables */ + int i,j,k,nodepar; /* scratch */ + int offset; /* number of sites in this receive or send */ + int *idx; /* index array pointer */ + int nsites; + char *tpt; /* temp ptr to buffer */ + msg_tag *mbuf, *mp; /* list of message tags, to be returned */ + comlist_struct *cp; + send_struct *sp; + receive_struct *rp; + static int index = MIN_GATHER_INDEX; /* index to identify the operation */ + + + if (dir < 0 || dir >= NDIRS) { + printf("No such gather %d, node %d\n",dir,mynode()); + terminate(1212); + } + + /* First, get the rolling index for the operation + * This MUST BE HERE even if there's nothing to do, because + * somebody else might be doing this! + */ + ++index ; if (index > MAX_GATHER_INDEX) index = MIN_GATHER_INDEX; + + cp = &comlist[dir]; + + /* Now if there's nothing to do, return - CHECK IF GATHERED */ + if( ( cp->n_send == 0 && cp->n_receive == 0 ) + || is_already_gathered( field, size, dir, parity) ) + return( (msg_tag *) NULL ); + + /* mark gathered, if needed */ + gather_mark_gathered( field, size, dir, parity ); + + timer_start( &start_gather_timer ); + + /* allocate a buffer for the msg_tags. This is dynamically allocated + because there may be an arbitrary number of gathers in progress + in any direction. SIZE = n_msgs sent+received, for end flag */ + + /* mbuf = (msg_tag *)malloc((cp->n_send + cp->n_receive + 1)*sizeof(msg_tag) ); + if(mbuf==NULL){ + printf("No room for mbuf, node %d\n",mynode()); + terminate(1212); + } + */ + + mbuf = get_msg_tags(cp->n_send + cp->n_receive); + + /* Loop over node parity */ + forbothparities(nodepar) { + + mp=mbuf; + /* HANDLE RECEIVES: loop over nodes which will send here */ + for (i=0, rp=cp->from_node; in_receive;i++, rp=rp->next, mp=mp->next) { + + if (nodepar == node.parity) { + MPI_Status status; + + /* note--neighbors of EVEN sites are always first in the list! + * Thus, for ODD sites we must change the offset + */ + switch (parity) { + case EVEN: nsites = rp->n_even; offset = rp->offset; break; + case ODD: nsites = rp->n_odd; offset = rp->offset + rp->n_even; break; + case EVENODD: nsites = rp->n; offset = rp->offset; break; + } + + mp->flag = RECEIVE_FLAG; /* flag as normal receive */ + mp->dir = dir; + /* and post receive -- comes right on spot */ + MPI_Recv( ((char *)field) + offset*size, nsites*size, MPI_BYTE, + rp->node, index, MPI_COMM_WORLD, &status ); + } + } + + /* HANDLE SENDS - note: mp automatically correct */ + for(k=0,sp=cp->to_node; k < cp->n_send; k++,sp = sp->next, mp = mp->next) { + if (nodepar != node.parity) { + switch (parity) { + case EVEN: nsites = sp->n_even; offset = 0; break; + case ODD: nsites = sp->n_odd; offset = sp->n_even; break; + case EVENODD: nsites = sp->n; offset = 0; break; + } + + /* allocate buffer */ + tpt = (char *)malloc( nsites*size ); + if(tpt==NULL){printf("NO ROOM for tpt, node %d\n",mynode());exit(1);} + mp->flag = SEND_FLAG; /* flag as send */ + mp->dir = dir; + mp->buf = tpt; + /* gather data into the buffer */ + + idx = sp->sitelist + offset; /* initial offset */ + for (j=0; jbuf, nsites*size, MPI_BYTE, + sp->node, index, MPI_COMM_WORLD ); + + free( mp->buf ); + } + } + + } + + timer_end( &start_gather_timer ); + + release_msg_tags( mbuf ); + + /* return */ + return((msg_tag*)NULL); +} + + +msg_tag * wait_gather( msg_tag *mbuf ) +{ + MPI_Status status; + msg_tag *mp; + + if (mbuf == NULL) return((msg_tag *)NULL); + timer_start( &wait_receive_timer ); + /* wait for all receive messages */ + for(mp=mbuf; mp != NULL && mp->flag == RECEIVE_FLAG; mp=mp->next) { + MPI_Wait( &(mp->mpi), &status ); + } + timer_end( &wait_receive_timer ); + /* wait for all send messages */ + timer_start( &wait_send_timer ); +#ifndef SEND_TEST + for( ; mp != NULL && mp->flag == SEND_FLAG; mp=mp->next) { + MPI_Wait( &(mp->mpi), &status ); + /* release the buffer */ + free( mp->buf ); + } +#endif + timer_end( &wait_send_timer ); + /* and free the mbuf */ + release_msg_tags( mbuf ); + return((msg_tag *)NULL); +} + + +/**************************************************************** + */ + + +/* SEND AND RECEIVE FIELD */ +/* send_field is to be called only by the node doing the sending */ +/* get_field is to be called only by the node to which the field was sent */ +void send_field(buf,size,tonode) + void *buf; int size,tonode; +{ + timer_start( &send_timer ); + MPI_Send(buf,size,MPI_BYTE,tonode,FIELD_TYPE,MPI_COMM_WORLD); + timer_end( &send_timer ); + mpi_sent_data += size; +} +void receive_field(buf,size) + void *buf; int size; +{ + MPI_Status status; + + timer_start( &send_timer ); + MPI_Recv(buf,size,MPI_BYTE,MPI_ANY_SOURCE,FIELD_TYPE, + MPI_COMM_WORLD,&status); + timer_end( &send_timer ); + mpi_sent_data += size; +} + +/* BASIC COMMUNICATIONS FUNCTIONS */ + +/* Tell what kind of machine we are on */ +static char name[]="MPI (portable)"; +char * machine_type(){ + return(name); +} + +/* Return my node number */ +int mynode() +{ + int node; + MPI_Comm_rank( MPI_COMM_WORLD, &node ); + return(node); +} + +/* Return number of nodes */ +int numnodes() +{ + int nodes; + MPI_Comm_size( MPI_COMM_WORLD, &nodes ); + return(nodes); +} + +/* Synchronize all nodes */ +void g_sync() +{ + timer_start( &g_sync_timer ); + MPI_Barrier( MPI_COMM_WORLD ); + timer_end( &g_sync_timer ); +} + +/* Sum float over all nodes to node 0 */ +void g_floatsum( float * fpt ) +{ + float work; + + timer_start( &g_sum_timer ); + if (this_node == 0) { + MPI_Reduce( fpt, &work, 1, MPI_FLOAT, MPI_SUM, 0 , MPI_COMM_WORLD ); + *fpt = work; + } else { + MPI_Reduce( fpt, &work, 1, MPI_FLOAT, MPI_SUM, 0 , MPI_COMM_WORLD ); + } + timer_end( &g_sum_timer ); +} + +/* Sum double over all nodes */ +void g_doublesum( double * dpt ) +{ + double work; + + timer_start( &g_sum_timer ); + if (this_node == 0) { + MPI_Reduce( dpt, &work, 1, MPI_DOUBLE, MPI_SUM, 0 , MPI_COMM_WORLD ); + *dpt = work; + } else { + MPI_Reduce( dpt, &work, 1, MPI_DOUBLE, MPI_SUM, 0 , MPI_COMM_WORLD ); + } + timer_end( &g_sum_timer ); +} + +/* Sum double over all nodes, and scatter the result */ +void g_doublesum_scatter( double * dpt ) +{ + double work; + + timer_start( &g_sum_timer ); + MPI_Allreduce( dpt, &work, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); + *dpt = work; + timer_end( &g_sum_timer ); +} + + + +/* Sum a vector of doubles over all nodes */ +void g_vecintsum( dpt, n ) + int *dpt; int n; +{ + register int *work; + register int i; + + timer_start( &g_sum_timer ); + work = (int *)malloc(n*sizeof(int)); + if (this_node == 0) { + MPI_Reduce( dpt, work, n, MPI_INT, MPI_SUM, 0 , MPI_COMM_WORLD ); + for (i=0; istatus == GATHER_FOLLOW && (a->gathered[dir] ^ parity) == 0) { +#ifdef CHECK_GATHER_FIELDS + /* now looks like is gathered, check if the checkup field has changed */ + if (((parity & EVEN) && (*((unsigned int *)field) != a->check_even[dir] )) || + ((parity & ODD ) && (*((unsigned int *)(field+size*(node.sites-1))) + != a->check_odd[dir] ))) { + printf(" #### GATHER CHECK: Forgotten mark_changed() somewhere!\n"); + if (size < sizeof(int)) + printf(" Because field size %d < sizeof(int), can be spurious\n",size); + else { + printf(" Field size %d chars, dir %d parity %d\n",size, dir, parity); + } + halt(" ###### "); + } +#endif + n_gather_avoided++; + return(1); + } + n_gather_done++; + return(0); +} + +void gather_status_reset( char *field, int size ) +{ + struct gather_status_arr *a; + /* cast the latfield array */ + a = (struct gather_status_arr *) (field + size*node.latfield_size); + a->status = GATHER_NOT_FOLLOW; +} + +void gather_mark_dirty( char *field, int size, int parity ) +{ + int dir,p; + struct gather_status_arr *a; + + /* cast the latfield array */ + a = (struct gather_status_arr *) (field + size*node.latfield_size); + a->status = GATHER_FOLLOW; + p = opp_parity(parity); + /* mark opposite parity, because will fetch from there! */ + /* Remember that need to mark opposite directions too! */ + for(dir=0; dirgathered[dir] &= (!p); +} + +void gather_mark_gathered( char *field, int size, int dir, int parity ) +{ + struct gather_status_arr *a; + + /* cast the latfield array */ + a = (struct gather_status_arr *) (field + size*node.latfield_size); + if (a->status == GATHER_FOLLOW) a->gathered[dir] |= parity; +#ifdef CHECK_GATHER_FIELDS + if (parity & EVEN) a->check_even[dir] = *((unsigned int *)field); + if (parity & ODD ) a->check_odd[dir] = + *((unsigned int *)(field+size*(node.sites-1))); +#endif +} + + +/****************************************************************/ + + + +/* version of exit for multinode processes -- kill all nodes */ +void terminate(int status) +{ + printf("Termination: node %d, status = %d\n",this_node,status); + fflush(stdout); + MPI_Abort( MPI_COMM_WORLD, 0); + exit(status); +} + + + + +/* clean exit from all nodes */ +void finishrun() +{ +#ifdef TIMERS + report_comm_timers(); +#endif + + if (this_node == 0) { + extern int n_gather_done,n_gather_avoided; + + printf(" COMMS from node 0: %d done, %d (%.2g%%) optimized away\n", + n_gather_done, n_gather_avoided, + 100.0*n_gather_avoided/(n_gather_avoided+n_gather_done)); + } + + fflush(stdout); + fflush(NULL); /* Try to flush all open files */ + MPI_Finalize(); + exit(0); +} + + +void report_comm_timers() +{ +#ifdef TIMERS + double tot; + if (this_node == 0) { + printf(" *************************\n"); + printf(" MPI communications timers from node 0:\n"); + + printf(" start_get: "); + timer_report( &start_gather_timer ); + printf(" waiting send: "); + timer_report( &wait_send_timer ); + printf(" receive: "); + timer_report( &wait_receive_timer ); + printf(" g_sync: "); + timer_report( &g_sync_timer ); + printf(" g_sum: "); + timer_report( &g_sum_timer ); + printf(" broadcast: "); + timer_report( &broadcast_timer ); + printf(" send/receive: "); + timer_report( &send_timer ); + printf(" send/receive %g MB, bandwith %g MB/sec\n", + mpi_sent_data*1e-6, mpi_sent_data*1e-6/send_timer.total); + + tot = start_gather_timer.total + wait_send_timer.total + wait_receive_timer.total + + g_sync_timer.total + + g_sum_timer.total + broadcast_timer.total + send_timer.total ; + + /* find current time */ + timer_end( &total_time ); + + printf(" Total comm. time %.3g, total time %.3g, comm %.2g%%\n", + tot, total_time.total, 100*tot/total_time.total ); + printf(" ***** \n"); + } +#endif +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_vanilla.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_vanilla.c new file mode 100644 index 0000000000000000000000000000000000000000..76d05233bd4cfb806a42c09632c8eaf6622ae5df --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_vanilla.c @@ -0,0 +1,208 @@ +/****************** com_vanilla.c *************************************** + * + * Communications routines, for single node interface + * + * + + g_sync() provides a synchronization point for all nodes. + g_floatsum() sums a floating point number over all nodes. + g_doublesum() sums a double over all nodes. + g_vecdoublesum() sums a vector of doubles over all nodes. + g_floatmax() finds maximum of a floating point number over all nodes. + g_doublemax() finds maximum of a double over all nodes. + broadcast_float() broadcasts a single precision number from + node 0 to all nodes. + broadcast_double() broadcasts a double precision number + send_integer() sends an integer to one other node + receive_integer() receives an integer + terminate() kills the job on all processors + + start_gather() starts asynchronous sends and receives required + to gather neighbors. + wait_gather() waits for receives to finish, insuring that the + data has actually arrived. + + send_field() sends a field to one other node. + receive_field() receives a field from some other node. +*/ + +/* load in definitions, variables etc. */ + +#include "comdefs.h" +#include "generic.h" + +#define MIN_GATHER_INDEX 100 +#define MAX_GATHER_INDEX 7100 /* allows 7000 concurrent gathers */ +#define FIELD_TYPE 11 /* used in send/receive field */ +#define SEND_INTEGER_TYPE 12 /* used in send/receive int */ + +#define SEND_FLAG 1 /* flags for marking msg_tags */ +#define RECEIVE_FLAG 2 +#define END_FLAG 3 + + +extern comlist_struct *comlist; /* the comlist variables in layout */ + + +/************************************************************************/ + +/* Machine initialization */ +#include +void initialize_machine(argc,argv) int argc; char **argv; { + /* MPI_Init(&argc,&argv); */ +} + + +/* GATHER ROUTINES */ +/* start_gather() returns a pointer to a list of msg_tag's, which + be used as input to subsequent wait_gather(). + + This list contains msg_tags for all receive buffers, followed by + end flag. + + If no messages at all are required, the routine will return NULL. + msg_buf=NULL should be a reliable indicator of no message. + + usage: tag = start_gather( source, size, direction, parity ) +*/ + +msg_tag * start_gather( field, size, dir, parity ) + /* arguments */ + char * field; /* pointer to some latfield */ + int size; /* size in bytes of the field (eg sizeof(su3_vector))*/ + int dir; /* direction to gather from. eg XUP - index into + neighbor tables */ + int parity; /* parity of sites whose neighbors we gather. + one of EVEN, ODD or EVENODD (EVEN+ODD). */ +{ + return((msg_tag *)NULL); +} + + +msg_tag * wait_gather( msg_tag *mbuf ) +{ + if (mbuf == NULL) return((msg_tag *)NULL); + halt ("Wait in com_vanilla! Never happens! "); + return((msg_tag *)NULL); +} + + +/***************************************************/ + +msg_tag * start_scatter( field, size, dir, parity ) + char * field; /* pointer to some latfield */ + int size; /* size in bytes of the field (eg sizeof(su3_vector))*/ + int dir; + int parity; /* parity of sites whose neighbors we scatter + one of EVEN or ODD */ +{ + return((msg_tag *)NULL); +} + +msg_tag * wait_scatter( msg_tag *mbuf ) +{ + if (mbuf == NULL) return((msg_tag *)NULL); + halt ("Wait in com_vanilla! Never happens! "); + return((msg_tag *)NULL); +} + +/***************************************************/ + + +/* SEND AND RECEIVE FIELD */ +/* send_field is to be called only by the node doing the sending */ +/* get_field is to be called only by the node to which the field was sent */ +void send_field(buf,size,tonode) + void *buf; int size,tonode; +{ + /* MPI_Send(buf,size,MPI_BYTE,tonode,FIELD_TYPE,MPI_COMM_WORLD); */ +} +void receive_field(buf,size) + void *buf; int size; +{ + /* MPI_Status status; + MPI_Recv(buf,size,MPI_BYTE,MPI_ANY_SOURCE,FIELD_TYPE, + MPI_COMM_WORLD,&status); + */ +} + +/* BASIC COMMUNICATIONS FUNCTIONS */ + +/* Tell what kind of machine we are on */ +static char name[]="Single node (vanilla)"; +char * machine_type(){ + return(name); +} + +/* Return my node number */ +int mynode() +{ + return( 0 ); +} + +/* Return number of nodes */ +int numnodes() +{ + return( 1 ); +} + +/* Synchronize all nodes */ +void g_sync() {} + +/* Sum float over all nodes to node 0 */ +void g_floatsum( float * fpt, int dist ) {} + +/* Sum double over all nodes */ +void g_doublesum( double * dpt, int dist ) {} + +/* Sum a vector of doubles over all nodes */ +void g_vecintsum( int *dpt, int nfloats, int dist ) {} + +/* Sum a vector of doubles over all nodes */ +void g_vecfloatsum( float *dpt, int nfloats, int dist ) {} + +/* Sum a vector of doubles over all nodes */ +void g_vecdoublesum( double *dpt, int ndoubles, int dist ) {} + +/* Find maximum of float over all nodes */ +void g_floatmax( float * fpt ) {} + +/* Find maximum of double over all nodes */ +void g_doublemax( double *dpt ) {} + +/* Broadcast a whole field */ +void broadcast_field(void *pt, int size) {} + +/* Broadcast floating point number from node zero */ +void broadcast_float(float *fpt) {} + +/* Broadcast double precision floating point number from node zero */ +void broadcast_double( double *dpt ) {} + +/* Broadcast double precision floating point number from node zero */ +void broadcast_int( int *dpt ) {} + +/* Send an integer to one other node */ +/* This is to be called only by the node doing the sending */ +void send_integer(tonode,address) + int tonode; int *address; +{ } + +/* Receive an integer from another node */ +/* Note we do not check if this was really meant for us */ +void receive_integer(address) + int *address; +{ } + +/* version of exit for multinode processes -- kill all nodes */ +void terminate(int status) +{ + printf("Termination: node %d, status = %d\n",this_node,status); + exit(status); +} + +/* clean exit from all nodes */ +void finishrun() +{ + exit(1); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/comdefs.h b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/comdefs.h new file mode 100644 index 0000000000000000000000000000000000000000..b8c6daeced97112f364d0f814851865a03fef789 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/comdefs.h @@ -0,0 +1,483 @@ +/************************* comdefs.h ************************************* + * Header file to define global (and hidden from user) variables + * and define macros etc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#ifdef CAN_DO_ALLOCA +#include /* needed for alloca */ +#endif + +#ifdef MPI +#include +#endif + +#ifdef __GNUC__ +#define INLINE inline +#else +#define INLINE +#endif + +#include "radix.h" + +/* This version divides the lattice by factors of two in any of the + four directions. It prefers to divide the longest dimensions, + which mimimizes the area of the surfaces. Similarly, it prefers + to divide dimensions which have already been divided, thus not + introducing more off-node directions. + + This requires that the lattice volume be divisible by the number + of nodes, which is a power of two. + + With the "GRAYCODE" option the node numbers are gray coded so that + adjacent lattice regions will physically be on adjacent nodes + in a hypercube architecture + + With the "EVENFIRST" option the even sites are listed contiguously + in the first part of the fields, and the odd sites in the last part. +*/ + +#ifndef NO_EVENFIRST /* use evenfirst in checkerboard-type parallel update */ +#define EVENFIRST +#endif +/* #define GRAYCODE */ + +/* Ensure correct sharing of variables */ +#ifdef CONTROL +#define EXTERN +#else +#define EXTERN extern +#endif + +/* Define parity variables */ + +#define EVEN 0x01 +#define ODD 0x02 +#define EVENODD 0x03 +#define ALL EVENODD + + +/* Directions, and a macro to give the opposite direction */ +/* Also define NDIRS = number of directions */ + +#ifndef DIMENSION +#define NDIM 3 +#else +#define NDIM DIMENSION +#endif + +#define NDIRS (2*NDIM) /* number of directions */ + +#if NDIM > 1 +#define XUP 0 +#define YUP 1 +#define XDOWN (NDIRS-1-XUP) +#define YDOWN (NDIRS-1-YUP) +#if NDIM > 2 +#define ZUP 2 +#define ZDOWN (NDIRS-1-ZUP) +#if NDIM > 3 +#define TUP 3 +#define TDOWN (NDIRS-1-TUP) +#endif +#endif +#endif + +#define opp_dir(dir) (NDIRS-1-(dir)) /* Opposite direction */ +#define is_up_dir(dir) (dir < NDIM) /* is it up-direction */ + +#define coordinate(i,dir) site[i].x[dir] +#define xcoord(i) coordinate(i,XUP) +#define ycoord(i) coordinate(i,YUP) +#define zcoord(i) coordinate(i,ZUP) +#define tcoord(i) coordinate(i,TUP) + +/************** some typedefs -- lattice, node, and site specific */ + +typedef struct lattice { + int volume,size[NDIM]; +} lattice_struct; + +typedef struct node { + int sites,evensites,oddsites; + int xmin[NDIM],nodesize[NDIM]; /* coordinate min and max values */ + int down_node[NDIM],up_node[NDIM]; /* indices of nodes up and down to each direction */ + int latfield_size; /* used in allocating latfields */ +} node_struct; + +typedef struct site { + int parity, index, x[NDIM]; +} site_struct; + +/* Structure to keep track of outstanding sends and receives */ +typedef struct msg_tag_struct { + int flag,dir; /* status of the msg, direction */ + double start_time; /* start time for this gather */ + int size,nsites; /* Info about comms - used only in scatter */ + char *buf; /* buffer for the send messages */ + char *field; /* ptr to latfield - used only in scatter */ + int *sitelist; /* site list pointer - used only by scatter */ + struct msg_tag_struct *next; /* next tag in the possible list */ +#ifdef MPI + MPI_Request mpi; /* message id returned by system call */ +#endif +} msg_tag; + +/* define COMLINK structures: this is defined for all + * nn-gathers. Each node contains + * comlink[dirs], which contains a list of + * send/receive node structs for each of the gathers. + * + * sendnode contains sitelist[], which is the list + * of sites which has to be copied to send buffer (allocated) + * The sites are ALWAYS ODD FIRST! Thus, when neighb. of + * even sites are collected, odd sites are moved (and offset = std.) + * For even sites we collect odd ones. + */ + +typedef struct sendnode { + int node; /* node index to send to */ + int n_even, n_odd, n; /* number of sites to be sent */ + int *sitelist; /* list of sites to be sent */ + struct sendnode *next; +} send_struct; + +typedef struct receivenode { + int node; /* node index to receive from */ + int n_even, n_odd, n; /* number of sites to be received */ + int offset; /* offset of the fields in latfield */ + struct receivenode *next; +} receive_struct; + +typedef struct comlist { + send_struct * to_node; + receive_struct * from_node; + int n_send,n_receive; +} comlist_struct; + +/*********************************************************/ +/* These routines check if we need to do fetching or not. + */ + +#ifdef MPI + +#define CHECK_GATHER_FIELDS /* define this to have additional check */ + +typedef struct gather_status_arr { + int status; + unsigned char gathered[NDIRS]; +#ifdef CHECK_GATHER_FIELDS + /* these are used to check if there is forgotten mark_changed */ + unsigned int check_even[NDIRS], check_odd[NDIRS]; +#endif +} gather_status_arr; + +#define GATHER_STATUS_SIZE sizeof(struct gather_status_arr) + +void gather_status_reset( char *field, int size ); +void gather_mark_dirty( char *field, int size, int parity ); +int is_already_gathered( char *field, int size, int dir, int parity ); +void gather_mark_gathered( char *field, int size, int dir, int parity ); + +/* Routine for marking the field 'dirty' */ +#define mark_changed( a, parity ) \ + gather_mark_dirty( (char *)a, ((char *)&(a[1])) - ((char *)&(a[0])), parity ) + +#else +/*** NON-MPI routines ***/ + +#define GATHER_STATUS_SIZE 0 + +#define mark_changed( a, parity ) /* nothing */ + +#endif + + +/***************** Critical global variables defined here ********/ + +EXTERN int *neighb[NDIRS]; /* neighbour arrays */ +EXTERN site_struct *site; /* site array hangs here */ +EXTERN lattice_struct lattice; /* lattice defn */ +EXTERN node_struct node; /* and node information */ +EXTERN int this_node; /* number of this node */ +EXTERN int number_of_nodes; + +EXTERN int current_blocking_level[NDIM]; /* currently running blocking level */ +EXTERN lattice_struct base_lattice; /* base lattice struct */ + +/************* MACROS for latfield *****************/ + +#define nb(dir,i) neighb[dir][i] + +#define new_latfield( typ ) \ + (typ *)latfield_alloc( sizeof(typ) ) + +#define new_latfield_size( siz ) (char *)latfield_alloc( siz ) + +#define free_latfield( field ) if (field != NULL) free( field ) + +char *copy_latfield_func( char *f, int siz); +#define copy_latfield( f, typ ) (typ *)copy_latfield_func((char *)f, sizeof(typ) ) + +/*------------ Do we have alloca? ----------------*/ +#ifdef CAN_DO_ALLOCA +/* allocate the tmp_latfield from the stack */ +static char *tmp_latf_ptr_; +#define tmp_latfield( typ ) \ + ( (tmp_latf_ptr_ = alloca( node.latfield_size * sizeof(typ) \ + + GATHER_STATUS_SIZE)) == NULL ? \ + (typ *)halt("alloca() error") : (typ *)tmp_latf_ptr_ ) +#define free_tmp( ptr ) /* nothing */ + +#else /* now not can alloca */ +#define tmp_latfield( typ ) new_latfield( typ ) +#define free_tmp( ptr ) free_latfield( ptr ) +#endif + +/*------------- General field blocking -----------*/ + +#define block_field( field, b, fr ) \ + block_field_prg( field, ((char *)&(field[1])) - ((char *)&(field[0])), b, fr ) + +/************* Gathering ***************************/ + +/* async gather and wait */ +#define start_get( a, dir, parity ) \ + start_gather( (char *)a, ((char *)&(a[1])) - ((char *)&(a[0])), dir, parity ) + +#define wait_get( tg ) wait_gather( tg ) + +/* synchronous gather */ +#define get_field( a, dir, parity ) wait_get( start_get( a, dir, parity ) ) + +/* async scatter and wait */ +#define start_put( a, dir, parity ) \ + start_scatter( (char *)a, ((char *)&(a[1])) - ((char *)&(a[0])), dir, parity ) + +#define wait_put( tg ) wait_scatter( tg ) + +/************* MACROS for looping ******************/ + +#define forbothparities(parity) for (parity=EVEN; parity<=ODD; parity++) + +#define forallsites(i) for(i=0; idir; wait_loop_=1; } else wait_loop_=0; \ +for(wait_i_=0; wait_i_<=wait_loop_; wait_i_++, tag = wait_gather( tag )) \ +forallsites(i) if ((!wait_loop_) || ((wait_i_) ^ (nb(wait_dir1_,i) < node.sites) )) + +#define forallsites_wait2(i,tag1,tag2) \ +if (tag1 != NULL) wait_dir1_=tag1->dir; \ +if (tag2 != NULL) wait_dir2_=tag2->dir; \ +if (tag1 != NULL || tag2 != NULL) wait_loop_=1; else wait_loop_=0; \ +if (tag1 == NULL) wait_dir1_ = wait_dir2_; /* short circuit these */ \ +if (tag2 == NULL) wait_dir2_ = wait_dir1_; \ +for(wait_i_=0; wait_i_<=wait_loop_; \ + wait_i_++, tag1 = wait_gather(tag1), tag2 = wait_gather(tag2)) \ +forallsites(i) if ((!wait_loop_) || ((wait_i_) ^ (nb(wait_dir1_,i) < node.sites && \ + nb(wait_dir2_,i) < node.sites) )) + +#define forparity_wait(i,par,tag) \ +if (tag != NULL) { wait_dir1_=tag->dir; wait_loop_=1; } else wait_loop_=0; \ +for(wait_i_=0; wait_i_<=wait_loop_; wait_i_++, tag = wait_gather( tag )) \ +forparity(i,par) if ((!wait_loop_) || ((wait_i_) ^ (nb(wait_dir1_,i) < node.sites) )) + +/********* now, make gather with arbitrary waits -- defined in com_mpi */ +void initialize_wait_arrays(); +unsigned int setup_wait_arr( msg_tag *t[], int ntag ); +#define NA_MAX 10 /* arbitrarily 10 gathers */ +EXTERN unsigned char *wait_arr_; +static unsigned int site_mask_; + +/* here wait_loop_ is 0 or 1 */ +#define forallsites_waitA(i,tag,ntag) \ +for (site_mask_ = setup_wait_arr( tag, ntag ), \ + wait_loop_ = (site_mask_ != 0), wait_i_=0; \ + wait_i_<=wait_loop_; wait_gather_arr(tag,ntag), wait_i_++) \ +forallsites(i) if (((site_mask_ & wait_arr_[i]) != 0) == wait_i_) + +/* here wait_loop_ is 0 or 1 */ +#define forparity_waitA(i,parity,tag,ntag) \ +for (site_mask_ = setup_wait_arr( tag, ntag ), \ + wait_loop_ = (site_mask_ != 0), wait_i_=0; \ + wait_i_<=wait_loop_; wait_gather_arr(tag,ntag), wait_i_++) \ +forparity(i,parity) if (((site_mask_ & wait_arr_[i]) != 0) == wait_i_) + + +#ifdef OLD_WAIT_ARR +int setup_wait_arr( unsigned char *wait_arr, msg_tag *to[], msg_tag *ti[], int ntag ); +EXTERN msg_tag *waitA_tags[NA_MAX+1]; + +#define forallsites_waitA(i,tag,ntag) \ +for (wait_loop_ = setup_wait_arr( wait_arr_, waitA_tags, tag, ntag ), \ + wait_i_=0; wait_i_<=wait_loop_; wait_gather(waitA_tags[wait_i_]), wait_i_++ ) \ +forallsites(i) if (wait_i_ == wait_arr_[i]) + +#define forparity_waitA(i,parity,tag,ntag) \ +for (wait_loop_ = setup_wait_arr( wait_arr_, waitA_tags, tag, ntag ), \ + wait_i_=0; wait_i_<=wait_loop_; wait_gather(waitA_tags[wait_i_]), wait_i_++ ) \ +forparity(i,parity) if (wait_i_ == wait_arr_[i]) +#endif + +/* now, if we define updates inside bulk most of the above stuff is + * superfluous. Let it be for compatibility though + */ +#ifdef NODE_UPDATE +int active_link(int i,int dir); +int active_site(int i); +#endif + + +/************************************************************************/ + +#else +/** Non-MPI versions **/ +#define forallsites_wait(i,tag) forallsites(i) +#define forallsites_wait2(i,tag1,tag2) forallsites(i) +#define forparity_wait(i,par,tag) forparity(i,par) +#define forallsites_wait3(i,tag1,tag2,tag3) forallsites(i) +#define forallsites_waitA(i,tag,ntag) forallsites(i) +#define forparity_waitA(i,parity,tag,ntag) forparity(i,parity) + +#ifdef NODE_UPDATE +#define active_link(i,dir) 1 +#define active_site(i,dir) 1 +#endif + + +#endif + + +/*********************************************************/ + +void zero_arr(int x[NDIM]); + +#define forallcoordinates(x) \ + for(zero_arr(x); is_allowed_coord(x,&lattice); step_coord(x,&lattice) ) + +#define foralldir(dir) for(dir=0; dir>1))) +/* Switches EVEN and ODD, leaves EVENODD*/ + +int is_allowed_coord(int x[NDIM],lattice_struct *l); +void step_coord(int x[NDIM],lattice_struct *l); + +/********** Some helpers *********************************/ + +/*********************************************************/ + +/* Communications routines */ +void send_field(void *,int,int); +void receive_field(void *,int); +char * machine_type(); + +void g_sync(); +void g_vecintsum(int *,int,int); +void g_floatsum(float *,int); +void g_vecfloatsum(float *,int,int); +void g_doublesum(double *,int); +void g_vecdoublesum(double *,int,int); +void g_floatmax(float *); +void g_doublemax(double *); +void broadcast_field(void *p,int siz); +void broadcast_float(float *); +void broadcast_double(double *); +void broadcast_int(int *); +void send_integer(int node,int *address); +void receive_integer(int *); +double dclock(); +void terminate(); void finishrun(); + +char *memalloc(int n, int size); +char *latfield_alloc(int size); + +msg_tag *start_gather(char *field, int size, int dir, int parity ); +msg_tag *wait_gather(msg_tag *mbuf); +void wait_gather_arr(msg_tag *mbuf[],int n); + +msg_tag *start_scatter(char *field, int size, int dir, int parity ); +msg_tag *wait_scatter(msg_tag *mbuf); + + +void copy_lat_data_to_node( void *dat, int dsize, + int xmin[NDIM], int xmax[NDIM], void *t, int node ); +void copy_lat_slice( void *dat, int dsize, int dir, int slice, void *t); + + +void setup_lattice(int size[NDIM]); +void make_lattice_arrays(lattice_struct * l); +void initialize_machine(); +void make_gathers(); +char *machine_type(); +int mynode(),numnodes(); +int node_number(int loc[NDIM]), node_index(int loc[NDIM],node_struct *s); +int is_on_node(int loc[NDIM]); + +void set_blocking_level(int b[NDIM]); +void set_blocking_all(int d); +int *make_blocking_map(int b[NDIM]); +#define reset_blocking_level() set_blocking_all(0) + +void report_comm_timers(); + +#ifdef RADIX_F +#define g_radixsum g_floatsum +#define g_vecradixsum g_vecfloatsum +#define broadcast_radix broadcast_float +#elif defined(RADIX_D) +#define g_radixsum g_doublesum +#define g_vecradixsum g_vecdoublesum +#define broadcast_radix broadcast_double +#else + no radix +#endif + +#define g_veccomplexsum(a, b, c) g_vecradixsum((radix *)a, 2*(b), c) +#define g_complexsum(a, b) g_vecradixsum((radix *)(a), 2, b) + + +/**** Other protos ****/ + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/copy_lat_data_to_zero.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/copy_lat_data_to_zero.c new file mode 100644 index 0000000000000000000000000000000000000000..0b08443df0550527d513a0c79c9dab76c6c9b5fa --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/copy_lat_data_to_zero.c @@ -0,0 +1,106 @@ +/**************************************************************** + * This routine copies 'latfield'-distributed data from all nodes + * to node zero. It copies a box determined by coordinates + * xmin and xmax. The target field t must be already allocated + */ + +#include "comdefs.h" +#include "generic.h" + +void copy_lat_data_to_node( void *dat, int dsize, + int xmin[NDIM], int xmax[NDIM], void *t, int target_node ) +{ + int i,d,n,ok,idx; + int x[NDIM],xsiz[NDIM],nmin[NDIM],nmax[NDIM],vmin[NDIM],vmax[NDIM],siz; + extern node_struct *allnodes; /* defined in layout.c */ + node_struct *np; + char *src, *trg; + + + foralldir(d) { + if (xmin[d] < 0 || xmax[d] > lattice.size[d] || xmin[d] > xmax[d]) + halt("Block size error in copy_lat_data"); + } + + foralldir(d) xsiz[d] = xmax[d] - xmin[d] + 1; + + /* Go through the nodes */ + for (n=0; nxmin[d]; + nmax[d] = np->xmin[d] + np->nodesize[d] - 1; + vmin[d] = greater(nmin[d],xmin[d]); + vmax[d] = smaller(nmax[d],xmax[d]); + ok = ok && (vmax[d] - vmin[d] >= 0); + siz = siz * (vmax[d] - vmin[d] + 1); + } + + /* printf("NODE %d, size %d, OK %d\n",n,siz,ok); + */ + if (ok) { + /* Now is included */ + char * cp = NULL; + if (n != target_node) cp = (char *)memalloc(siz,dsize); + + if (this_node == target_node && n != target_node) { + /* node target sends ack to n, and receives */ + send_field( &siz, sizeof(int), n ); + receive_field( cp, siz*dsize ); + } + + /* copy data */ + foralldir(d) x[d] = vmin[d]; + x[0]--; /* need to subtract 1 from 1st to make the addition */ + for (i=0; i vmax[d]) { x[d] = vmin[d]; d++; } + + /* copy from cp if we're node 0 receiving, else dat */ + if (this_node == target_node && n != target_node) src = cp + i*dsize; + else src = ((char *)dat) + node_index(x,np)*dsize; + + /* copy to t in node target, else to cp */ + if (this_node != target_node) trg = cp + i*dsize; + else { + idx = 0; + for(d=NDIM-1; d>=0; d--) idx = x[d]-xmin[d] + idx*xsiz[d]; + trg = ((char *)t) + idx*dsize; + } + + memcpy( trg, src, dsize ); + } + + if (this_node != target_node) { + /* receive ack, and send the stuff */ + receive_field( &i, sizeof(int) ); + if (i != siz) halt(" copy_lat_data siz error"); + send_field( cp, siz*dsize, target_node ); + } + + if (n != target_node) free( cp ); + } + } /* if this_node == target || n */ + } /* loop over nodes */ + + g_sync(); +} + + +void copy_lat_slice(void *dat, int dsize, int dir, int slice, void *t) +{ + int d,x1[NDIM], x2[NDIM]; + + foralldir(d) { + if (d != dir) { + x1[d] = 0; + x2[d] = lattice.size[d]; + } else x1[d] = x2[d] = slice; + } + + copy_lat_data_to_node( dat, dsize, x1, x2, *t, 0 ); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/gauge_stuff.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/gauge_stuff.c new file mode 100644 index 0000000000000000000000000000000000000000..c2a985c61156b57164192cefa6159faa2dce2aad --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/gauge_stuff.c @@ -0,0 +1,465 @@ +/****** gauge_stuff.c -- ******************/ +/* MIMD version 6 */ +/* gauge action stuff for improved action +* T.D. and A.H. general gauge action updating code +* D.T. modified 5/97 +* D.T. modified 12/97, optimized gauge_force a little +* D.T. modified 3/99, gauge action in include file */ + +/**#define GFTIME**/ /* For timing gauge force calculation */ +#include "generic_includes.h" /* definitions files and prototypes */ + +#ifdef LOOPEND +#undef FORALLSITES +#define FORALLSITES(i,s) \ +{ register int loopend; loopend=sites_on_node; \ +for( i=0, s=lattice ; iTUP) +void printpath( int *path, int length ); + +#define GAUGE_ACTION_PART1 +/* defines NREPS NLOOP MAX_LENGTH MAX_NUM */ +#include +#undef GAUGE_ACTION_PART1 + +char gauge_action_description[128]; +int gauge_action_nloops=NLOOP; +int gauge_action_nreps=NREPS; +int loop_length[NLOOP]; /* lengths of various kinds of loops */ +int loop_num[NLOOP]; /* number of rotations/reflections for each kind */ + + /* table of directions, 1 for each kind of loop */ +int loop_ind[NLOOP][MAX_LENGTH]; + /* table of directions, for each rotation and reflection of each kind of + loop. tabulated with "canonical" starting point and direction. */ +int loop_table[NLOOP][MAX_NUM][MAX_LENGTH]; + /* table of coefficients in action, for various "representations" (actually, + powers of the trace) */ +float loop_coeff[NLOOP][NREPS]; + /* for each rotation/reflection, an integer distinct for each starting + point, or each cyclic permutation of the links */ +int loop_char[MAX_NUM]; + /* for each kind of loop for each rotation/reflection, the expectation + value of the loop */ +double loop_expect[NLOOP][NREPS][MAX_NUM]; + + +/* Make table of loops in action */ +void make_loop_table() { + + int perm[8],pp[8],ir[4]; + int length,iloop,i,j,chr; + int vec[MAX_LENGTH]; + int count,flag; + void char_num( int *dig, int *chr, int length); + +#define GAUGE_ACTION_PART2 +/* defines all loops and their coefficients */ +#include +#undef GAUGE_ACTION_PART2 + + for(iloop=0;iloopMAX_NUM){ + node0_printf("OOPS: MAX_NUM too small\n"); + exit(0); + } + loop_num[iloop]=count; + + } /* end reflection*/ + } /* end permutation if block */ + } /* end permutation */ + } /* end iloop */ + + /* print out the loop coefficients */ + node0_printf("loop coefficients: nloop rep loop_coeff multiplicity\n"); + for(i=0;i=0;j--) *chr= *chr*10+dig[j]; + + /* forward*/ + old=*chr; + for(j=length-1;j>=1;j--){ + newv=old-tenl*dig[j]; + newv=newv*10+dig[j]; + if(newv < *chr) *chr=newv; + old=newv; } + + /* backward*/ + for(j=0;j=0;j--) old=old*10+bdig[j]; + if(old < *chr ) *chr=old; + for(j=length-1;j>=1;j--){ + newv=old-tenl*bdig[j]; + newv=newv*10+bdig[j]; + if(newv < *chr) *chr=newv; + old=newv; } + +} /* char_num */ + +double imp_gauge_action() { + register int i; + int rep; + register site *s; + complex trace; + double g_action; + double action,act2,total_action; + int length; + + /* these are for loop_table */ + int ln,iloop; + + g_action=0.0; + + /* gauge action */ + for(iloop=0;ilooptempmat1 ); + action = 3.0 - (double)trace.real; + /* need the "3 -" for higher characters */ + total_action= (double)loop_coeff[iloop][0]*action; + act2=action; + for(rep=1;repstaple.e[j][k]=cmplx(0.0,0.0); + } END_LOOP + + ncount=0; + for(iloop=0;ilooptempmat1), &tmat1 ); + /* first we compute the fundamental term */ + new_term = loop_coeff[iloop][0]; + + /* now we add in the higher representations */ + if(NREPS > 1){ +node0_printf("WARNING: THIS CODE IS NOT TESTED\n"); exit(0); + act2=1.0; + action = 3.0 - realtrace_su3(&(st->link[dir]), + &tmat1 ); + + for(j=1;j 1 */ + + scalar_mult_add_su3_matrix( &(st->staple), &tmat1, + new_term, &(st->staple) ); + + } END_LOOP + + ncount++; + + } /* k (location in path) */ + } /* ln */ + } /* iloop */ + + /* Now multiply the staple sum by the link, then update momentum */ + FORALLSITES(i,st){ + mult_su3_na( &(st->link[dir]), &(st->staple), &tmat1 ); + momentum = (anti_hermitmat *)F_PT(st,mom_off); + uncompress_anti_hermitian( &momentum[dir], &tmat2 ); + scalar_mult_sub_su3_matrix( &tmat2, &tmat1, + eb3, &(st->staple) ); + make_anti_hermitian( &(st->staple), &momentum[dir] ); + } END_LOOP + } /* dir loop */ +#ifdef GFTIME +dtime+=dclock(); +node0_printf("GFTIME: time = %e (Symanzik1) mflops = %e\n",dtime, + nflop*volume/(1e6*dtime*numnodes()) ); +#endif +} /* imp_gauge_force.c */ + +/* Measure gauge observables: + Loops in action (time and space directions treated differently) + Polyakov loop + +*/ +void g_measure( ){ + double ss_plaquette, st_plaquette; + complex p_loop; + register int i; + register site *s; + complex trace; + double average[NREPS],action,act2,total_action; + int length; + /* these are for loop_table */ + int ln,iloop,rep; + + /* KS and BC minus signs should be out for this routine */ + d_plaquette( &ss_plaquette, &st_plaquette ); + if(this_node==0)printf("PLAQ:\t%f\t%f\n", ss_plaquette, st_plaquette ); + + p_loop = ploop(); + if(this_node==0)printf("P_LOOP:\t%e\t%e\n", p_loop.real, p_loop.imag ); + + /* gauge action, all loops that contribute */ + total_action=0.0; + for(iloop=0;ilooptempmat1 ); + average[0] += (double)trace.real; + action = 3.0 - (double)trace.real; + total_action += (double)loop_coeff[iloop][0]*action; + /* need the "3 -" for higher characters */ + act2=action; + for(rep=1;rep +void dsdu_qhb_subl(int dir, int subl) +{ +register site *st; +register int i; +int iloop, ln, k, j; +int dirs[MAX_LENGTH], length; +int path_dir[MAX_LENGTH], path_length; +su3_matrix tmat1; +int fsubl; + + assert(NREPS==1); /* This procedure designed only for NREPS = 1 */ + + FORSOMESUBLATTICE(i,st,subl) { + clear_su3mat(&(st->staple)); + } + + for(iloop=0;ilooptempmat1), &tmat1 ); + scalar_mult_add_su3_matrix(&(st->staple), &tmat1, + loop_coeff[iloop][0], &(st->staple) ); + } + } /* k (location in path) */ + } /* ln */ + } /* iloop */ + + g_sync(); + +} /* dsdu_qhb */ + +#endif /* N_SUBL32 */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/gaugefix.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/gaugefix.c new file mode 100644 index 0000000000000000000000000000000000000000..d8aaf1feb62f80a6884078a188d1dc39112808a3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/gaugefix.c @@ -0,0 +1,550 @@ +/************************** gaugefix.c *******************************/ +/* Fix Coulomb or Lorentz gauge by doing successive SU(2) gauge hits */ +/* Uses double precision global sums */ +/* MIMD version 6 */ +/* C. DeTar 10-22-90 */ +/* T. DeGrand 1993 */ +/* U.M. Heller 8-31-95 */ +/* C. DeTar 10-11-97 converted to generic */ + +/* Prototype... + + void gaugefix(int gauge_dir,radix relax_boost,int max_gauge_iter, + radix gauge_fix_tol, field_offset diffmat, field_offset sumvec, + int nvector, field_offset vector_offset[], int vector_parity[], + int nantiherm, field_offset antiherm_offset[], + int antiherm_parity[] ) + ------------------------------------------------------------------- + + NOTE: For staggered fermion applications, it is necessary to remove + the KS phases from the gauge links before calling this procedure. + See "rephase" in setup.c. + + ------------------------------------------------------------------- + EXAMPLE: Fixing only the link matrices to Coulomb gauge with scratch + space in mp (su3_matrix) and chi (su3_vector): + + gaugefix(TUP,(float)1.5,500,(float)1.0e-7, + F_OFFSET(mp),F_OFFSET(chi),0,NULL,NULL,0,NULL,NULL); + + ------------------------------------------------------------------- + EXAMPLE: Fixing Coulomb gauge with respect to the y direction + in the staggered fermion scheme and simultaneously transforming + the pseudofermion fields and gauge-momenta involved in updating: + + int nvector = 3; + field_offset vector_offset[3] = { F_OFFSET(g_rand), F_OFFSET(phi), + F_OFFSET(xxx) }; + int vector_parity[3] = { EVENANDODD, EVEN, EVEN }; + int nantiherm = 4; + field_offset antiherm_offset[4] = { F_OFFSET(mom[0]), F_OFFSET(mom[1]), + F_OFFSET(mom[2]), F_OFFSET(mom[3]) }; + field_offset antiherm_parity[4] = { EVENANDODD, EVENANDODD, EVENANDODD, + EVENANDODD } + + rephase( OFF ); + gaugefix(YUP,(float)1.8,500,(float)2.0e-6, + F_OFFSET(tempmat1),F_OFFSET(tempvec[0]), + nvector,vector_offset,vector_parity, + nantiherm,antiherm_offset,antiherm_parity); + rephase( ON ); + + ------------------------------------------------------------------- + + gauge_dir specifies the direction of the "time"-like hyperplane + for the purposes of defining Coulomb or Lorentz gauge + TUP for evaluating propagators in the time-like direction + ZUP for screening lengths. + 8 for Lorentz gauge + relax_boost Overrelaxation parameter + max_gauge_iter Maximum number of iterations + gauge_fix_tol Stop if change is less than this + diffmat Scratch space for an su3 matrix + sumvec Scratch space for an su3 vector + NOTE: if diffmat or sumvec are negative, gaugefix mallocs its own + scratch space. */ + +#include "generic_includes.h" + +/* Generic definitions - could be useful elsewhere */ + +/* CDIF(a,b) a -= b */ + /* a -= b */ +#define CDIF(a,b) { (a).real -= (b).real; (a).imag -= (b).imag; } + +/* Scratch space */ + +su3_matrix *diffmatp; /* malloced diffmat pointer */ +su3_vector *sumvecp; /* malloced sumvec pointer */ +field_offset diffmat_offset,sumvec_offset; /* field offsets */ + +void mult_su2_mat_vec_elem_n(su2_matrix *u,complex *x0,complex *x1) +{ + /* Multiplies the complex column spinor (x0, x1) by the SU(2) matrix u */ + /* and puts the result in (x0,x1). */ + /* Thus x <- u * x */ + /* C. DeTar 3 Oct 1990 */ + + complex z0, z1, t0, t1; + + t0 = *x0; t1 = *x1; + + CMUL(u->e[0][0], t0, z0); + CMUL(u->e[0][1], t1, z1); + CADD(z0, z1, *x0); + CMUL(u->e[1][0], t0, z0); + CMUL(u->e[1][1], t1, z1); + CADD(z0, z1, *x1); + +} /* mult_su2_mat_vec_elem_n */ + +void mult_su2_mat_vec_elem_a(su2_matrix *u,complex *x0,complex *x1) +{ + /* Multiplies the complex row spinor (x0, x1) by the adjoint of the */ + /* SU(2) matrix u and puts the result in (x0,x1). */ + /* Thus x <- x * u-adj */ + /* C. DeTar 3 Oct 1990 */ + + complex z0, z1, t0, t1; + + t0 = *x0; t1 = *x1; + + CMUL_J(t0, u->e[0][0], z0); + CMUL_J(t1, u->e[0][1], z1); + CADD(z0, z1, *x0); + CMUL_J(t0, u->e[1][0], z0); + CMUL_J(t1, u->e[1][1], z1); + CADD(z0, z1, *x1); + +} /* mult_su2_mat_vec_elem_a */ + +void dumpsu2(su2_matrix *u) +{ + int i,j; + for(i=0;i<2;i++){ + for(j=0;j<2;j++)printf("(%.2e,%.2e)\t", + (double)u->e[i][j].real,(double)u->e[i][j].imag); + printf("\n"); + } + printf("\n"); +} + +void left_su2_hit_n(su2_matrix *u,int p,int q,su3_matrix *link) +{ + /* link <- u * link */ + /* The 0 row of the SU(2) matrix u matches row p of the SU(3) matrix */ + /* The 1 row of the SU(2) matrix u matches row q of the SU(3) matrix */ + /* C. DeTar 18 Oct 1990 */ + + register int m; + + for (m = 0; m < 3; m++) + mult_su2_mat_vec_elem_n(u, &(link->e[p][m]), &(link->e[q][m])); + +} /* left_su2_hit_n */ + +void right_su2_hit_a(su2_matrix *u,int p,int q,su3_matrix *link) +{ + /* link <- link * u adj */ + /* The 0 column of u-adjoint matches column p of the SU(3) matrix */ + /* The 1 column of u-adjoint matches column q of the SU(3) matrix */ + /* C. DeTar 18 Oct 1990 */ + + register int m; + + for (m = 0; m < 3; m++) + mult_su2_mat_vec_elem_a(u, &(link->e[m][p]), &(link->e[m][q])); + +} /*right_su2_hit_a */ + +void accum_gauge_hit(int gauge_dir,int parity) +{ + +/* Accumulates sums and differences of link matrices for determining optimum */ +/* hit for gauge fixing */ +/* Differences are kept in diffmat and the diagonal elements of the sums */ +/* in sumvec */ + + register int j; + register su3_matrix *m1,*m2; + register int dir,i; + + /* Clear sumvec and diffmat */ + + forsomeparity(i,parity) + { + if(diffmat_offset >= 0) + clear_su3mat((su3_matrix *)F_PT(s,diffmat_offset)); + else + clear_su3mat(&diffmatp[i]); + if(sumvec_offset >= 0) + clearvec((su3_vector *)F_PT(s,sumvec_offset)); + else + clearvec(&sumvecp[i]); + } + + /* Subtract upward link contributions */ + + FORSOMEPARITY(i,s,parity) + { + FORALLUPDIRBUT(gauge_dir,dir) + { + /* Upward link matrix */ + m1 = &(s->link[dir]); + if(diffmat_offset >= 0) + sub_su3_matrix((su3_matrix *)F_PT(s,diffmat_offset), + m1, (su3_matrix *)F_PT(s,diffmat_offset)); + else + sub_su3_matrix( &diffmatp[i], m1, &diffmatp[i]); + + if(sumvec_offset >= 0) + { + for(j=0;j<3;j++)CSUM( ((su3_vector *)F_PT(s,sumvec_offset))->c[j], + m1->e[j][j]); + } + else + { + for(j=0;j<3;j++)CSUM( sumvecp[i].c[j],m1->e[j][j]); + } + } + } + + /* Add downward link contributions */ + + FORSOMEPARITY(i,s,parity) + { + FORALLUPDIRBUT(gauge_dir,dir) + { + /* Downward link matrix */ + m2 = (su3_matrix *)gen_pt[dir][i]; + + if(diffmat_offset >= 0) + add_su3_matrix((su3_matrix *)F_PT(s,diffmat_offset), m2, + (su3_matrix *)F_PT(s,diffmat_offset)); + else + add_su3_matrix( &diffmatp[i], m2, &diffmatp[i]); + + if(sumvec_offset >= 0) + { + for(j=0;j<3;j++)CSUM( ((su3_vector *)F_PT(s,sumvec_offset))->c[j], + m2->e[j][j]); + } + else + { + for(j=0;j<3;j++)CSUM( sumvecp[i].c[j], m2->e[j][j]); + } + + /* Add diagonal elements to sumvec */ + } + } +} /* accum_gauge_hit */ + + +void do_hit(int gauge_dir, int parity, int p, int q, float relax_boost, + int nvector, field_offset vector_offset[], int vector_parity[], + int nantiherm, field_offset antiherm_offset[], + int antiherm_parity[] ) +{ + /* Do optimum SU(2) gauge hit for p, q subspace */ + + float a0,a1,a2,a3,asq,a0sq,x,r,xdr; + register int dir,i,j; + register site *s; + su2_matrix u; + su3_matrix htemp; + + /* Accumulate sums for determining optimum gauge hit */ + + accum_gauge_hit(gauge_dir,parity); + + FORSOMEPARITY(i,s,parity) + { + /* The SU(2) hit matrix is represented as a0 + i * Sum j (sigma j * aj)*/ + /* The locally optimum unnormalized components a0, aj are determined */ + /* from the current link in direction dir and the link downlink */ + /* in the same direction on the neighbor in the direction opposite dir */ + /* The expression is */ + /* a0 = Sum dir Tr Re 1 * (downlink dir + link dir) */ + /* aj = Sum dir Tr Im sigma j * (downlink dir - link dir) j = 1,2, 3 */ + /* where 1, sigma j are unit and Pauli matrices on the p,q subspace */ + /* + a0 = s->sumvec.c[p].real + s->sumvec.c[q].real; + a1 = s->diffmat.e[q][p].imag + s->diffmat.e[p][q].imag; + a2 = -s->diffmat.e[q][p].real + s->diffmat.e[p][q].real; + a3 = s->diffmat.e[p][p].imag - s->diffmat.e[q][q].imag; +*/ + if(sumvec_offset >= 0) + a0 = ((su3_vector *)F_PT(s,sumvec_offset))->c[p].real + + ((su3_vector *)F_PT(s,sumvec_offset))->c[q].real; + else + a0 = sumvecp[i].c[p].real + sumvecp[i].c[q].real; + + if(diffmat_offset >= 0) + { + a1 = ((su3_matrix *)F_PT(s,diffmat_offset))->e[q][p].imag + + ((su3_matrix *)F_PT(s,diffmat_offset))->e[p][q].imag; + a2 = -((su3_matrix *)F_PT(s,diffmat_offset))->e[q][p].real + + ((su3_matrix *)F_PT(s,diffmat_offset))->e[p][q].real; + a3 = ((su3_matrix *)F_PT(s,diffmat_offset))->e[p][p].imag - + ((su3_matrix *)F_PT(s,diffmat_offset))->e[q][q].imag; + } + else + { + a1 = diffmatp[i].e[q][p].imag + diffmatp[i].e[p][q].imag; + a2 = -diffmatp[i].e[q][p].real + diffmatp[i].e[p][q].real; + a3 = diffmatp[i].e[p][p].imag - diffmatp[i].e[q][q].imag; + } + + /* Over-relaxation boost */ + + /* This algorithm is designed to give little change for large |a| */ + /* and to scale up the gauge transformation by a factor of relax_boost*/ + /* for small |a| */ + + asq = a1*a1 + a2*a2 + a3*a3; + a0sq = a0*a0; + x = (relax_boost*a0sq + asq)/(a0sq + asq); + r = sqrt((double)(a0sq + x*x*asq)); + xdr = x/r; + /* Normalize and boost */ + a0 = a0/r; a1 = a1*xdr; a2 = a2*xdr; a3 = a3*xdr; + + /* Elements of SU(2) matrix */ + + u.e[0][0] = cmplx( a0, a3); + u.e[0][1] = cmplx( a2, a1); + u.e[1][0] = cmplx(-a2, a1); + u.e[1][1] = cmplx( a0,-a3); + + + /* Do SU(2) hit on all upward links */ + + FORALLUPDIR(dir) + left_su2_hit_n(&u,p,q,&(s->link[dir])); + + /* Do SU(2) hit on all downward links */ + + FORALLUPDIR(dir) + right_su2_hit_a(&u,p,q,(su3_matrix *)gen_pt[dir][i]); + + /* Transform vectors and gauge momentum if requested */ + + for(j = 0; j < nvector; j++) + + /* Do SU(2) hit on specified su3 vector for specified parity */ + + /* vector <- u * vector */ + if(vector_parity[j] == EVENANDODD || vector_parity[j] == parity) + mult_su2_mat_vec_elem_n(&u, + &((su3_vector *)F_PT(s,vector_offset[j]))->c[p], + &((su3_vector *)F_PT(s,vector_offset[j]))->c[q]); + + /* Transform antihermitian matrices if requested */ + + for(j = 0; j < nantiherm; j++) + /* antiherm <- u * antiherm * u^dagger */ + if(antiherm_parity[j] == EVENANDODD || antiherm_parity[j] == parity) + { + uncompress_anti_hermitian( + (anti_hermitmat *)F_PT(s,antiherm_offset[j]), &htemp); + /* If the next 2 steps prove too time consuming, */ + /* they can be simplified algebraically, and sped up by ~2 */ + left_su2_hit_n(&u,p,q,&htemp); + right_su2_hit_a(&u,p,q,&htemp); + make_anti_hermitian( &htemp, + (anti_hermitmat *)F_PT(s,antiherm_offset[j])); + } + } + + /* Exit with modified downward links left in communications buffer */ +} /* do_hit */ + +double get_gauge_fix_action(int gauge_dir,int parity) +{ + /* Adds up the gauge fixing action for sites of given parity */ + /* Returns average over these sites */ + /* The average is normalized to a maximum of 1 when all */ + /* links are unit matrices */ + + register int dir,i,ndir; + register site *s; + register su3_matrix *m1, *m2; + double gauge_fix_action; + complex trace; + + gauge_fix_action = 0.0; + + FORSOMEPARITY(i,s,parity) + { + FORALLUPDIRBUT(gauge_dir,dir) + { + m1 = &(s->link[dir]); + m2 = (su3_matrix *)gen_pt[dir][i]; + + trace = trace_su3(m1); + gauge_fix_action += (double)trace.real; + + trace = trace_su3(m2); + gauge_fix_action += (double)trace.real; + } + } + + /* Count number of terms to average */ + ndir = 0; FORALLUPDIRBUT(gauge_dir,dir)ndir++; + + /* Sum over all sites of this parity */ + g_doublesum( &gauge_fix_action); + + /* Average is normalized to max of 1/2 on sites of one parity */ + return(gauge_fix_action /((double)(6*ndir*nx*ny*nz*nt))); +} /* get_gauge_fix_action */ + +void gaugefixstep(int gauge_dir,double *av_gauge_fix_action,float relax_boost, + int nvector, field_offset vector_offset[], int vector_parity[], + int nantiherm, field_offset antiherm_offset[], + int antiherm_parity[] ) +{ + /* Carry out one iteration in the gauge-fixing process */ + + int parity; + msg_tag *mtag[8]; + float gauge_fix_action; + register int dir,i; + register site *s; + + /* Alternate parity to prevent interactions during gauge transformation */ + *av_gauge_fix_action = 0.; + g_sync(); + fflush(stdout); + + for(parity = ODD; parity <= EVEN; parity++) + { + /* Start gathers of downward links */ + + FORALLUPDIR(dir) + { + mtag[dir] = start_gather( F_OFFSET(link[dir]), sizeof(su3_matrix), + OPP_DIR(dir), parity, gen_pt[dir] ); + } + + /* Wait for gathers */ + + FORALLUPDIR(dir) + { + wait_gather(mtag[dir]); + } + + /* Total gauge fixing action for sites of this parity: Before */ + gauge_fix_action = get_gauge_fix_action(gauge_dir,parity); + + /* Do optimum gauge hit on various subspaces */ + + do_hit(gauge_dir,parity,0,1, relax_boost, + nvector, vector_offset, vector_parity, + nantiherm, antiherm_offset, antiherm_parity); + do_hit(gauge_dir,parity,1,2, relax_boost, + nvector, vector_offset, vector_parity, + nantiherm, antiherm_offset, antiherm_parity); + do_hit(gauge_dir,parity,2,0, relax_boost, + nvector, vector_offset, vector_parity, + nantiherm, antiherm_offset, antiherm_parity); + + /* Total gauge fixing action for sites of this parity: After */ + gauge_fix_action = get_gauge_fix_action(gauge_dir,parity); + + *av_gauge_fix_action += gauge_fix_action; + + /* Scatter downward link matrices by gathering to sites of */ + /* opposite parity */ + + FORALLUPDIR(dir) + { + /* Synchronize before scattering to be sure the new modified link */ + /* matrices are all ready to be scattered and diffmat is not */ + /* overwritten before it is used */ + g_sync(); + + /* First copy modified link for this dir */ + /* from comm buffer or node to diffmat */ + + FORSOMEPARITY(i,s,parity) + { + if(diffmat_offset >= 0) + su3mat_copy((su3_matrix *)(gen_pt[dir][i]),(su3_matrix *)F_PT(s,diffmat_offset)); + else + su3mat_copy((su3_matrix *)(gen_pt[dir][i]), &diffmatp[i]); + } + + /* Now we are finished with gen_pt[dir] */ + cleanup_gather(mtag[dir]); + + /* Synchronize to make sure the previous copy happens before the */ + /* subsequent gather below */ + g_sync(); + + /* Gather diffmat onto sites of opposite parity */ + if(diffmat_offset >= 0) + mtag[dir] = start_gather( diffmat_offset, sizeof(su3_matrix), + dir, OPP_PAR(parity), gen_pt[dir] ); + else + mtag[dir] = start_gather_from_temp( diffmatp, sizeof(su3_matrix), + dir, OPP_PAR(parity), gen_pt[dir] ); + + wait_gather(mtag[dir]); + + /* Copy modified matrices into proper location */ + + FORSOMEPARITY(i,s,OPP_PAR(parity)) + su3mat_copy((su3_matrix *)(gen_pt[dir][i]),&(s->link[dir])); + + cleanup_gather(mtag[dir]); + } + + } +} /* gaugefixstep */ + + +void gaugefix(int gauge_dir,radix relax_boost,int max_gauge_iter, + radix gauge_fix_tol, su3_matrix *diffmat, su3_vector *sumvec, + int nvector, field_offset vector_offset[], int vector_parity[], + int nantiherm, field_offset antiherm_offset[], + int antiherm_parity[] ) +{ + int gauge_iter; + int alloc_diffmat,alloc_sumvec; + double current_av, old_av, del_av; + + alloc_diffmat = alloc_sumvec = 0; + if (diffmat == NULL) { + diffmat = new_latfield( su3_matrix ); + alloc_diffmat = 1; + } + if (sumvec == NULL) { + sumvec = new_latfield( su3_vector ); + alloc_sumvec = 1; + } + + /* Do at most max_gauge_iter iterations, but stop after the second step if */ + /* the change in the avg gauge fixing action is smaller than gauge_fix_tol */ + + for (gauge_iter=0; gauge_iter < max_gauge_iter; gauge_iter++) + { + gaugefixstep(gauge_dir,¤t_av,relax_boost, + nvector, vector_offset, vector_parity, + nantiherm, antiherm_offset, antiherm_parity); + + if(gauge_iter != 0) + { + del_av = current_av - old_av; + if (fabs(del_av) < gauge_fix_tol) break; + } + old_av = current_av; + } + /* Free workspace */ + if (alloc_sumvec) free_latfield( sumvec ); + if (alloc_diffmat) free_latfield( diffmat ); + + if(this_node==0) + printf("GFIX WITHOUT REUNITARIZATION: Ended at step %d. Av gf action %.3e, delta %.3e\n", + gauge_iter,(double)current_av,(double)del_av); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/gaugefix2.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/gaugefix2.c new file mode 100644 index 0000000000000000000000000000000000000000..193f60bf080d9faa8ad04b6a8c34cf002b68a707 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/gaugefix2.c @@ -0,0 +1,332 @@ +/************************** gaugefix2.c *******************************/ +/* Fix Coulomb or Lorentz gauge by doing successive SU(2) gauge hits */ +/* Uses double precision global sums */ +/* This version does automatic reunitarization at preset intervals */ +/* MIMD version 6 */ +/* C. DeTar 10-22-90 */ +/* T. DeGrand 1993 */ +/* U.M. Heller 8-31-95 */ +/* C. DeTar 10-11-97 converted to generic */ +/* C. DeTar 12-26-97 added automatic reunitarization */ +/* C. DeTar 11-24-98 remove superfluous references to p2 (was for ks phases) */ + +/* Heavily modified by Kari Rummukainen 2005-6 */ + +/* Prototype... + +void gaugefix(int gauge_dir,double relax_boost,int max_gauge_iter, + double gauge_fix_tol, suN_matrix gauge ); + + if gauge == NULL do not return the gauge + + ------------------------------------------------------------------- + + NOTE: For staggered fermion applications, it is necessary to remove + the KS phases from the gauge links before calling this procedure. + See "rephase" in setup.c. + + ------------------------------------------------------------------- + EXAMPLE: Fixing only the link matrices to Coulomb gauge with scratch + space in mp (suN_matrix) and chi (suN_vector): + + gaugefix(TUP,1.5,500,1.0e-7,NULL); + + ------------------------------------------------------------------- + EXAMPLE: Fixing Coulomb gauge with respect to the y direction + in the staggered fermion scheme and simultaneously transforming + the pseudofermion fields and gauge-momenta involved in updating: + + int nvector = 3; + suN_vector * vec[3] = {g_rand, phi, xxx }; + int vector_parity[3] = { EVENODDODD, EVEN, EVEN }; + int nantiherm = 4; + int antiherm_parity[4] = { EVENODD, EVENODD, EVENODD, EVENODD } + + rephase( OFF ); + gauge = new_latfield(suN_matrix); + gaugefix( YUP, 1.8, 500, 2.0e-6, gauge ); + vec_fix_gauge( gauge, g_rand, EVENODD ); + vec_fix_gauge( gauge, phi, EVEN ); + vec_fix_gauge( gauge, xxx, EVEN ); + foralldir(d) ahmat_fix_gauge( gauge, mom[d], EVENODD ); + free_latfield( gauge ); + + rephase( ON ); + + ------------------------------------------------------------------- + + gauge_dir specifies the direction of the "time"-like hyperplane + for the purposes of defining Coulomb or Lorentz gauge + TUP for evaluating propagators in the time-like direction + ZUP for screening lengths. + -1 for Lorentz gauge + relax_boost Overrelaxation parameter + max_gauge_iter Maximum number of iterations + gauge_fix_tol Stop if change is less than this +*/ + +#include "lattice.h" +#define REUNIT_INTERVAL 20 + +#ifdef SU2 +DOES NOT WORK YET FOR SU2 +#endif + +/* CDIF(a,b) a -= b */ + /* a -= b */ +#define CDIF(a,b) { (a).real -= (b).real; (a).imag -= (b).imag; } + +/* Scratch space */ + +void accum_gauge_hit(int i, int gauge_dir, + su3_matrix *diffmat, su3_vector *sumvec ) +{ + +/* Accumulates sums and differences of link matrices for determining optimum */ +/* hit for gauge fixing */ +/* Differences are kept in diffmat and the diagonal elements of the sums */ +/* in sumvec */ + + register int j; + register su3_matrix *m1; + register int dir; + + /* Clear sumvec and diffmat */ + + clear_su3mat( diffmat ); + clearvec( sumvec ); + + /* Subtract upward link contributions */ + + foralldir(dir) if (dir != gauge_dir) { + int odir = opp_dir(dir); + + m1 = &(U[dir][i]); + sub_su3_matrix( diffmat, m1, diffmat); + /* Sum diagonal part */ + for(j=0; jc[j], m1->e[j][j] ); + + + /* Add downward link contributions */ + + m1 = &U[dir][nb(odir,i)]; + add_su3_matrix( diffmat, m1, diffmat ); + for(j=0; jc[j], m1->e[j][j] ); + } +} /* accum_gauge_hit */ + + +void do_hit(int gauge_dir, int parity, double relax_boost, su3_matrix *gauge ) +{ + /* Do optimum SU(2) gauge hit for p, q subspace */ + + double a0,a1,a2,a3,asq,a0sq,x,r,xdr; + int dir,i,p,q; + su2_matrix u; + su3_matrix diffmat; + su3_vector sumvec; + + /* Accumulate sums for determining optimum gauge hit - + * U's must have been fetched from down! */ + + /* accum_gauge_hit( gauge_dir, parity, diffmat, sumvec); */ + + forparity(i,parity) for (p=0; p = 1/2 + * This requires a random number generator named "dran()", returning + * a float uniformly distributed between zero and one. + */ + +#include +#include +#include + +#include "mersenne.h" +#define dran() mersenne() + +double gaussian_ran() +{ + static int iset=0; + static double gset; + register double fac,r,v1,v2; + + if (iset) { + iset = 0; + return(gset); + } + + do { + v1 = 2.0*dran() - 1.0; + v2 = 2.0*dran() - 1.0; + r = v1*v1 + v2*v2; + } while (r >= 1.0); + fac = sqrt( -log(r)/r ); + gset = v1*fac; + iset = 1; + return(v2*fac); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic.h b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic.h new file mode 100644 index 0000000000000000000000000000000000000000..2cbf58c8e711525feb7d74cbb69599fd9975aee1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic.h @@ -0,0 +1,76 @@ +/************************************************************************** + * Header file where to define headers for generic functions + * (not for communications, which are defined in comdefs.h) + */ + +/******************** Mersenne random numbers ***/ + +#include "mersenne.h" + +/** +#define MERSENNE_N 624 + +extern int mersenne_i; +extern double mersenne_array[MERSENNE_N]; + +#define mersenne() ( mersenne_i > 0 ? mersenne_array[--mersenne_i] : \ + mersenne_generate(&mersenne_i) ) + +void seed_mersenne(long); +double mersenne_generate(int *); + +**/ + +#define dran() mersenne() + + +/********************* General protos ***/ + +void initial_setup(); +void initialize_prn(long seed); +void *halt(char *); +double gaussian_ran(); +void restore_binary(FILE * f); +void save_binary(FILE * f); + +/*********************** Some defines ****/ + +#define smaller(a,b) ((a)<(b)? (a) : (b)) +#define greater(a,b) ((a)>(b)? (a) : (b)) +#define sqr(x) ((x)*(x)) +#define printf0 if (this_node != 0) { } else printf + +/********************** Parameter_io.c ***/ + +double get_d(FILE *f,char *s,int bcast); +int get_i(FILE *f,char *s,int bcast); +int get_s(FILE *f,char *s,char *target,int bcast); +int get_item(FILE *f,char *s,char *items[],int n_items, int bcast); +void print_d(FILE *f,char *s,double val); +void print_i(FILE *f,char *s,int val); +void print_s(FILE *f,char *s,char *val); + +/************************* TIMING STUFF **/ + +double added_cpu_time(); +double cputime(); +void timecheck(int iter, int maxiter, int status); +void inittimecheck(void); +int setup_timelimit(time_t t,int argc,char *argv); +void resettime(void); +void inittime(void); + +#define addtime(t) t += added_cpu_time() + +/************************ Multicanonical headers **/ + +int setmulti(); +double multi_weight(); +int mc_acceptance(int parity,double rt); +void set_mc_update(int parity); +void writemuca(); + +EXTERN int is_multicanonical, is_mucacalc; + +/**********************/ + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic_complex.h b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic_complex.h new file mode 100644 index 0000000000000000000000000000000000000000..10904fda5f779915ae001b78fe1115db3a46c775 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic_complex.h @@ -0,0 +1,21 @@ +/* Definitions which convert generic MATRIX definitions to su3 + */ +#define MATRIX complex +#define mult_MATRIX_nn(a,b,c) c_mul_nn( a, b, c ) +#define mult_MATRIX_na(a,b,c) c_mul_ni( a, b, c ) +#define mult_MATRIX_an(a,b,c) c_mul_in( a, b, c ) +#define mult_MATRIX_aa(a,b,c) c_mul_ii( a, b, c ) +#define add_MATRIX(a,b,c) c_add( a, b, c ) +#define scalar_mul_MATRIX(a,s,b) c_scalar_mul( a, s, b ) + +void smooth_field_complex(complex *link[NDIM], complex *cphi, int d[NDIM], + double c_mul_0, double c_mul_1); + +complex *block_field_complex(complex *f, int newlev[NDIM], int free_old); + +void block_link_complex( complex *oldl[NDIM], complex *newl[NDIM], + int newlev[NDIM], int free_old ); + +void smooth_link_complex( complex *link[NDIM], int d1[NDIM], int d2[NDIM], + double c_mul_0, double c_mul_1); + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic_su2.h b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic_su2.h new file mode 100644 index 0000000000000000000000000000000000000000..437e95493b5da3dc402140822e084b3fd08d716b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic_su2.h @@ -0,0 +1,25 @@ +/* Definitions which convert generic MATRIX definitions to su3 + */ + +#define MATRIX su2_matrix + +#define mult_MATRIX_nn(a,b,c) mult_su2_nn( (a), (b), (c) ) +#define mult_MATRIX_na(a,b,c) mult_su2_na( (a), (b), (c) ) +#define mult_MATRIX_an(a,b,c) mult_su2_an( (a), (b), (c) ) +#define mult_MATRIX_aa(a,b,c) mult_su2_aa( (a), (b), (c) ) +#define add_MATRIX(a,b,c) add_su2_matrix( (a), (b), (c) ) +#define scalar_mul_MATRIX(a,s,b) su2_scalar_mul( a, s, b ) + +#define prefetch_MATRIX( a ) prefetch_matrix( a ) + +void smooth_field_su2adjoint(su2_matrix *link[NDIM], adjoint *cphi, int d[NDIM], + double c_mul_0, double c_mul_1); + +adjoint *block_field_su2adjoint(adjoint *f, int newlev[NDIM], int free_old); + +void block_link_su2( su2_matrix *oldl[NDIM], su2_matrix *newl[NDIM], + int newlev[NDIM], int free_old ); + +void smooth_link_su2( su2_matrix *link[NDIM], int d1[NDIM], int d2[NDIM], + double c_mul_0, double c_mul_1); + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic_su3.h b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic_su3.h new file mode 100644 index 0000000000000000000000000000000000000000..eb7f5b72abbb553f92216cbb737f507668d1ce36 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic_su3.h @@ -0,0 +1,31 @@ +/* Definitions which convert generic MATRIX definitions to su3 + */ + + +#define MATRIX su3_matrix + +#define mult_MATRIX_nn(a,b,c) mult_su3_nn( &(a), &(b), &(c) ) +#define mult_MATRIX_na(a,b,c) mult_su3_na( &(a), &(b), &(c) ) +#define mult_MATRIX_an(a,b,c) mult_su3_an( &(a), &(b), &(c) ) +#define mult_MATRIX_aa(a,b,c) mult_su3_aa( &(a), &(b), &(c) ) +#define add_MATRIX(a,b,c) add_su3_matrix( &(a), &(b), &(c) ) +#define scalar_mul_MATRIX(a,s,b) scalar_mult_su3_matrix( &(a), s, &(b) ) + +#define prefetch_MATRIX( a ) prefetch_matrix( a ) + +void smooth_field_su3adjoint(su3_matrix *link[NDIM], adjoint_matrix *cphi, int d[NDIM], + double c_mul_0, double c_mul_1); + +adjoint_matrix *block_field_su3adjoint(adjoint_matrix *f, int newlev[NDIM], int free_old); + +void block_link_su3( su3_matrix *oldl[NDIM], su3_matrix *newl[NDIM], + int newlev[NDIM], int free_old ); + +void smooth_link_su3( su3_matrix *link[NDIM], int d1[NDIM], int d2[NDIM], + double c_mul_0, double c_mul_1); + +void reunitarize( su3_matrix *U[NDIM]); + +#ifdef SSE_INLINE +#include "../sse/inline_sse.h" +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/io_lattice_generic.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/io_lattice_generic.c new file mode 100644 index 0000000000000000000000000000000000000000..11472b3db38424912141d8692c91ed56e24b50ad --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/io_lattice_generic.c @@ -0,0 +1,313 @@ +/*********************** io_lattice_generic.c *************************/ +/* This reads and writes a (binary) lattice + * + * NOTE: THIS HAS TO BE ENCAPSULATED BY A FILE + * io_lattice.c + * WHICH DEFINES + * + * typedef struct { } allfields; + * + * void copy_fields(int site, allfields *s) copy from all latfields to s.(whatever) + * void set_fields(allfields *s, int site) copy s.(stuff) to lattice fields + * + * #include "../generic/io_lattice_generic.c" + */ + +/* read and write a binary lattice */ + +void save_binary_lattice(FILE *f); +void restore_lattice_slow(FILE * f); +void restore_lattice_fast(FILE * f); + +/* size of max comm buffer, in bytes - used in fast routines */ +#define N_MAX_COMMSIZE 50000 + +/* stored field */ + +typedef struct { + allfields a; + int x[NDIM]; +} stored_field; + + +void restore_binary(FILE * f) +{ + int j; + + if(this_node==0) { + int ok,i; + int dims[NDIM]; + + ok = (fread(&j,sizeof(int),1,f) == 1); + if (ok && j != NDIM) { + printf("* Lattice dimension error: in file %d, expecting %d\n",j,NDIM); + halt(" ####"); + } + + ok = ok && (fread(dims,sizeof(int),NDIM,f) == NDIM); + j = 0; + foralldir(i) if (dims[i] != lattice.size[i]) j = 1; + if (ok && j) { + printf("* Lattice size error: in file "); + for(i=0; i 1) { + /* # of nodes is the same, layout is likely the same - use + * fast io mode + */ + printf(" Loading config with fast I/O\n"); + j = 1; + } else { + printf(" Loading config with slow I/O\n"); + j = 0; /* or not */ + } + if (!ok) halt("config I/O error"); + } + + broadcast_int(&j); + + if (j) restore_lattice_fast(f); + else restore_lattice_slow(f); +} + + +void save_binary(FILE *f) +{ + + /* node 0 does all the writing */ + if(this_node==0){ + int i,j; + + i = NDIM; + j = sizeof(allfields); + if (fwrite(&i,sizeof(int),1,f) != 1 || + fwrite(lattice.size,sizeof(int),NDIM,f) != NDIM || + fwrite(&j,sizeof(int),1,f) != 1 || + fwrite(&number_of_nodes,sizeof(int),1,f) != 1) + halt("Error in writing lattice header"); + } + + save_binary_lattice(f); +} + +/************************************************************* + * Slow restore routine - 1 message/site + */ + +void restore_lattice_slow(FILE * f) +{ + stored_field nf; + + g_sync(); + + if (this_node == 0) { + int *sent; + int l,i,newnode; + + /* Node 0 reads, and sends site to correct node */ + + sent = (int *)memalloc(number_of_nodes,sizeof(int)); + for (l=0; l N_MAX_COMMSIZE) + buf_size = N_MAX_COMMSIZE/sizeof(stored_field); + else buf_size = node.sites; + + n_messages = node.sites / buf_size; + last_size = node.sites % buf_size; + if (last_size > 0) n_messages++; else last_size = buf_size; + + if (this_node == 0) { + + /* first read own stuff */ + forallsites(i) { + if (fread(&nf,sizeof(stored_field),1,f) != 1) + halt("Read error in restore_binary_fast"); + if (this_node != node_number(nf.x)) + halt ("Node number error in restore_binary_fast"); + idx = node_index(nf.x,&node); + set_fields( &nf.a, idx); + } + + if (number_of_nodes > 1) { + buf = (stored_field *)memalloc(buf_size,sizeof(stored_field)); + + for (n=1; n N_MAX_COMMSIZE) + buf_size = N_MAX_COMMSIZE / sizeof(stored_field); + else buf_size = node.sites; + + n_messages = node.sites / buf_size; + last_size = node.sites % buf_size; + if (last_size > 0) n_messages++; else last_size = buf_size; + + if (this_node == 0) { + + /* first write own stuff */ + forallsites(i) { + copy_fields(i, &nf.a); + foralldir(s) nf.x[s] = coordinate(i,s); + if (fwrite(&nf,sizeof(stored_field),1,f) != 1) + halt("Write error in save_binary"); + } + + if (number_of_nodes > 1) { + buf = (stored_field *)memalloc(buf_size,sizeof(stored_field)); + + for (n=1; n +#include +#include +#include "complex.h" +#include "su3.h" +#include "comdefs.h" +#include "generic.h" +#include "generic_su3.h" + +#ifndef check +#define check_action(a) /* nothing */ +#endif + +#define MAX_BOP 5 /* max number of blockings */ + +/* The following are global scalars */ +EXTERN long seed; /* random number seed */ +EXTERN int mc_steps,n_measurement,n_save; +EXTERN int n_iteration,n_thermal,iteration; +EXTERN double betag; +#ifdef HIGGS +EXTERN double p_x,p_y,betaA,beta4,beta2,betay; +EXTERN int n_correlation,w_correlation; +#endif + +EXTERN double wvalue; /*for multicanonical */ +EXTERN double timeu,timea,timerest; +EXTERN double ahitu,ahitua,ahithb,ahitax,ahitmc,ahitog; /* hit*/ +EXTERN int nhitu,nhitua,nhithb,nhitax,nhitmc,nhitog; +EXTERN int meas_sync,corr_sync; + +#ifdef HIGGS +/* correlation function globals */ +EXTERN int corrlen,n_corr; +EXTERN int n_bop,n_blocking,b_level[MAX_BOP]; + +/* correlation function pointers */ +#define N_CORR 8 +EXTERN float *c_array; +EXTERN float *cr2[MAX_BOP],*cr3[MAX_BOP],*ch0[MAX_BOP],*ch1[MAX_BOP]; +EXTERN float *cH0[MAX_BOP],*cH1[MAX_BOP],*cp0[MAX_BOP],*cp1[MAX_BOP]; + +#define b_const_a1 0.2 +#define b_const_a2 (0.25*(1.0-b_const_a1)) +#define b_const_g1 0.334 +#define b_const_g2 (0.5*(1.0-b_const_g1)) + +#endif + +/***************************************************************** + * Field variables + */ + +EXTERN su3_matrix *U[NDIM]; +#ifdef HIGGS +EXTERN adjoint_matrix *ahiggs; +#endif + +/*****************************************************************/ + +#define confname "config" + +/* PABS replace status by kernel_B.input.status */ +#define statname "kernel_B.input.status" + +#define measurename "measure" +#define corrname "correl" +#define wlname "wloop" + +/* PABS replace beta by kernel_B.input.beta */ +#define betaname "kernel_B.input.beta" + +#define weightname "weight" + +/* PABS replace parameters by kernel_B.input.parameters */ +#define paramname "kernel_B.input.parameters" + +#ifndef T3E +#define prefetch_adjoint(x) /* nothing */ +#define prefetch_matrix(x) /* nothing */ +#endif + +void reunitarize(su3_matrix *link[NDIM]); +int setup(void); +void load_config(int status); +void updatehiggs(int isover); +void measure(); void writemeas(); void hcorr(); void writecorr(); +void setfiles(int restart); +void dumpall(int status,int * maxiters); +void updategauge(int isrelax); +void relax(int dir, int parity, su3_matrix *link[NDIM], su3_matrix *staple +#ifdef HIGGS + , su3_matrix *ac +#endif + ); +void monte(int dir, int parity, su3_matrix *link[NDIM], su3_matrix *staple +#ifdef HIGGS + , su3_matrix *ac +#endif + ); +void staples_su3(su3_matrix *link[NDIM], su3_matrix *staple, int dir1,int parity); +double Xoverrelax(int parity, adjoint_matrix *ahiggs, adjoint_matrix *astaple); +double HBHiggs(int parity, adjoint_matrix *ahiggs, adjoint_matrix *astaple); +double act_gauge_adj(su3_matrix *a, su3_matrix *u,adjoint_matrix *b); + +complex measure_ploop(su3_matrix *link[NDIM], int dir); + +void staple1(int i, int dir1, MATRIX *link[NDIM], MATRIX *staple) ; + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/layout.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/layout.c new file mode 100644 index 0000000000000000000000000000000000000000..60503ff611050c4855b6d6d41cdcda501fbbe6c2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/layout.c @@ -0,0 +1,733 @@ +/**************************************************************** + * * + * Hypercubic lattice layout routines * + * Based on MILC lattice QCD code, pretty much modified * + * * + * These determine the distribution of sites on nodes, * + * and do the necessary setting up. * + * * + ***************************************************************/ + +#include "comdefs.h" +#include "generic.h" + +/* static variables for node calculations */ +int squaresize[NDIM]; /* dimensions of hypercubes */ +int nsquares[NDIM]; /* number of hypercubes in each direction */ + +/* GLOBALS for communications; needed by com_XXX.c and block_lattice.c */ +node_struct *allnodes; /* structure for all nodes on this run */ +comlist_struct *comlist; /* gather pointer for all gathers */ + +#define swap(a,b) {register int t; t=a; a=b; b=t; } + +void setup_layout( int siz[NDIM] ); +void test_gather( lattice_struct *lat ); +void make_gathers( lattice_struct *lat ); + +/*************************************************************** + * BASIC CALL FOR SETUP + * + * setup_lattice(int size[NDIM]); + */ + +void setup_lattice(int siz[NDIM]) +{ + + /* first, do the basic lattice layout */ + setup_layout( siz ); + + /* then, set up the comm arrays */ + make_lattice_arrays( &lattice ); + +#ifdef MPI + /* Initialize wait_array structures */ + initialize_wait_arrays(); +#endif +} + +/***************************************************************/ + +/* number of primes to be used in factorization */ +#define NPRIMES 4 +static int prime[NPRIMES] = {2,3,5,7}; + +/* Set up now squaresize and nsquares - arrays + * Print info to stdout as we proceed + */ + +void setup_layout( int siz[NDIM] ) +{ + int n,i,j,dir,nfactors[NPRIMES]; + + if(mynode()==0){ + printf(" LAYOUT OF THE LATTICE:\n %d dimensions, layout options: ",NDIM); +#ifdef GRAYCODE + printf("GRAYCODE "); +#endif +#ifdef EVENFIRST + printf("EVENFIRST "); +#endif + printf("\n"); + fflush(stdout); + } + + /* reset the blocking level (just in case) */ + foralldir(dir) current_blocking_level[dir] = 0; + + /* static global */ + this_node = mynode(); + + /* Figure out dimensions of rectangle */ + + lattice.volume = 1; + foralldir(dir) { + nsquares[dir] = 1; + squaresize[dir] = lattice.size[dir] = siz[dir]; + lattice.volume *= lattice.size[dir]; + } + + /* store the baseline */ + base_lattice = lattice; + + if (lattice.volume % numnodes()) { + printf0(" No hope of laying out the lattice using %d nodes\n",numnodes()); + finishrun(); + } + + /* Factorize the node number in primes + * These factors must be used in slicing the lattice! + */ + i = numnodes(); + for (n=0; n=0; n--) for(i=0; ij && squaresize[dir]%prime[n] == 0 ) j=squaresize[dir]; + + /* if one direction with largest dimension has already been + divided, divide it again. Otherwise divide first direction + with largest dimension. */ + + for (dir=0; dir1 && + squaresize[dir]%prime[n] == 0) break; + + /* not previously sliced, take one direction to slice */ + if (dir >= NDIM) for (dir=0; dir= NDIM) { + /* This cannot happen! */ + printf("CANNOT HAPPEN! in layout.c\n"); + finishrun(); + } + + /* Now slice it */ + squaresize[dir] /= prime[n]; nsquares[dir] *= prime[n]; + + } + + if (mynode() == 0) { + printf(" Processor layout: "); + foralldir(dir) { + if (dir > 0) printf(" x "); + printf("%d",nsquares[dir]); + } + printf("\n Sites on node: "); + foralldir(dir) { + if (dir > 0) printf(" x "); + printf("%d",squaresize[dir]); + } + printf("\n"); + } +} + +/**************** Get the node number for (BLOCKED) coordinates */ +int node_number(int loc[NDIM]) +{ + register int i,dir; + + i = (loc[NDIM-1] << current_blocking_level[NDIM-1]) / squaresize[NDIM-1]; + for (dir=NDIM-2; dir>=0; dir--) { + i = i*nsquares[dir] + + ((loc[dir] << current_blocking_level[dir]) / squaresize[dir]); + } + +#ifdef GRAYCODE + return( i ^ (i>>1) ); /* Gray code of i */ +#else + return( i ); +#endif +} + +/************** fast routine for clarifying if we're on THIS node */ + +int is_on_node(int loc[NDIM]) +{ + register int d,dir; + + foralldir(dir) { + d = loc[dir] - node.xmin[dir]; + if (d < 0 || d >= node.nodesize[dir] ) return(0); + } + return(1); +} + +/************** give site index for ON NODE sites */ + +int node_index(int loc[NDIM], node_struct *node) +{ + int dir,l,i,s; + + i = l = loc[NDIM-1] - node->xmin[NDIM-1]; + s = loc[NDIM-1]; + for (dir=NDIM-2; dir>=0; dir--) { + l = loc[dir] - node->xmin[dir]; + i = i*node->nodesize[dir] + l; + s += loc[dir]; + } + + /* now i contains the `running index' for site */ +#ifdef EVENFIRST + if (s%2 == 0) return( i/2 ); /* even site index */ + else return( i/2 + node->evensites ); /* odd site */ +#else + return( i ); +#endif +} + +/****************************************************** + * routines for stepping through the lattice in + * coordinates, as in + * #define forallcoordinates(x) \ + * for(zero_arr(x); is_coord(x,&lattice); step_coord(x,&lattice) ) + */ + +void zero_arr(int x[NDIM]) { register int d; foralldir(d) x[d] = 0; } + +int is_allowed_coord(int x[NDIM],lattice_struct *l) +{ + int d,i; + i = 1; + foralldir(d) i = (i && (x[d] >= 0) && (x[d] < l->size[d])); + return(i); +} + +void step_coord(int x[NDIM],lattice_struct *l) +{ + int d; + + for(d=0; d= l->size[d]; x[d++] = 0) ; + + /* check if the lattice is 'full' */ + if (d >= NDIM) x[NDIM-1] = l->size[NDIM-1]; +} + + +/************** + * set up the node structure for all of the nodes in + * the run (for BLOCKED coordinates). + */ + +void setup_node( int loc[NDIM], node_struct *n ) +{ + register int offset,dir,blev,l,c0,c1,s; + + n->sites = 1; + s = 0; + foralldir(dir) { + blev = 1 << current_blocking_level[dir]; + l = loc[dir] << current_blocking_level[dir]; /* normalized coord */ + offset = l % squaresize[dir]; /* normalized coord from node 'origin' */ + c0 = l - offset; /* coordinate of the origin */ + c1 = c0 + squaresize[dir] - 1; /* coordinate of the last point */ + + /* calculate the coordinate of the first blocked point on the + * node. If the origin is divisible by the blocking factor, + * then it belongs to the blocked lattice and the coordinate is + * just l0 / blev. However, if not, then the first blocked point + * is l0 / blev + 1. + */ + if (c0 % blev == 0) n->xmin[dir] = c0 >> current_blocking_level[dir]; + else n->xmin[dir] = (c0 >> current_blocking_level[dir]) + 1; + + /* Now the coordinate of the last blocked point. This is + * always c1/blev, regardless if it is divisible or not. + */ + + c1 = c1 >> current_blocking_level[dir]; + + /* now the length of the blocked lattice */ + n->nodesize[dir] = c1 - n->xmin[dir] + 1; + + /* need to accumulate size */ + n->sites *= n->nodesize[dir]; + /* and parity of the origin */ + s += n->xmin[dir]; + } + + if ( n->sites % 2 ) { + /* now odd sized node */ + if ( s % 2 == 0) n->evensites = n->sites/2 + 1; + else n->evensites = n->sites/2; + n->oddsites = n->sites - n->evensites; + } else { + n->evensites = n->oddsites = n->sites/2; + } +} + +/************************************************************ + * set up the node struct for all nodes + */ + +node_struct * setup_nodes(lattice_struct *lat) +{ + int i,l,d,n,x[NDIM]; + node_struct *p; + + /* allocate the node array */ + p = (node_struct *)memalloc( l=numnodes(), sizeof(node_struct) ); + for (i=0; isize[d]; /* neighbour of site */ + } else { + k = opp_dir(d); + x[k] = (x[k] - 1 + lat->size[k]) % lat->size[k]; /* neighbour of site */ + } + if (is_on_node(x)) neighb[d][i] = node_index(x,&node); + else { + nodes[num] = node_number(x); + index[num] = node_index(x, allnodes + nodes[num] ); + parity[num] = site[i].parity; /* parity of THIS */ + here[num] = i; + num++; + } + } + + comlist[d].n_receive = 0; + if (num > 0) { + /* now, get the number of nodes to be gathered from */ + for (i=0; inode; j++) r = &((*r)->next); + if (j == comlist[d].n_receive) { + /* NEW NODE to receive from */ + comlist[d].n_receive++; + (*r) = p = (receive_struct *)memalloc(1,sizeof(receive_struct)); + /* and fill in the node structure */ + p->node = nodes[i]; + p->n = 1; /* first site */ + p->n_even = p->n_odd = 0; + if ( parity[i] == EVEN ) p->n_even = 1; else p->n_odd = 1; + p->next = NULL; + } else { + /* add to OLD NODE */ + p = *r; + p->n ++; + if ( parity[i] == EVEN ) p->n_even ++; else p->n_odd ++; + } + } + + /* Calculate the offsets for the gathers */ + for (j=0, p=comlist[d].from_node; jnext) { + p->offset = c_offset; + c_offset += p->n; /* and increase the offset */ + } + + /* and NOW, finish the NEIGHBOR array */ + + for (j=0, p=comlist[d].from_node; jnext) { + /* Now, accumulate the locations to itmp-array, and sort the + * array according to the index of the sending node . + * First even neighbours + */ + for (par=EVEN; par<=ODD; par++) { + for (n=i=0; inode && parity[i] == par) { + itmp[n++] = i; + /* bubble sort the tmp-array */ + for (k=n-1; k > 0 && index[itmp[k]] < index[itmp[k-1]]; k--) + swap( itmp[k], itmp[k-1] ); + } + off = p->offset; + if (par == ODD) off += p->n_even; + /* finally, root indices according to offset */ + for (k=0; k 0 */ + + /* receive done, now opposite send. This is just the gather + * inverted + */ + + od = opp_dir(d); + comlist[od].n_send = comlist[d].n_receive; + + if (num > 0) { + p = comlist[d].from_node; + for (j=0, s=&(comlist[od].to_node); jnext), p = p->next) { + (*s) = q = (send_struct *)memalloc(1,sizeof(send_struct)); + q->node = p->node; + q->n = p->n; + q->n_even = p->n_odd; /* Note the swap ! even/odd refers to type of gather */ + q->n_odd = p->n_even; + q->next = NULL; + q->sitelist = (int *)memalloc(q->n, sizeof(int)); + + /* now, initialize sitelist -- Now, we first want ODD parity, since + * this is what even gather asks for! + */ + + for (n=0,par=ODD; par>=EVEN; par--) { + for (i=0; inode && parity[i] == par) { + (q->sitelist)[n++] = here[i]; + } + if (par == ODD && n != q->n_even) halt("Parity odd error 3"); + if (par == EVEN && n != q->n) halt("Parity even error 3"); + } + } + } + } /* directions */ + + free(nodes); + free(index); + free(parity); + free(here); + free(itmp); + + /* Finally, set the site to the final offset (better be right!) */ + node.latfield_size = c_offset; + +} + + +/************************************************************************ + * Do some test to validate the correctness of the gather + */ + +typedef struct t { + int x[NDIM],parity; +} tst_struct; + + +void gather_test_error( char *abuse, int dir, tst_struct *a, + tst_struct *n, int par ) +{ + int l; + + printf(" *** %s, parity %d, from dir %d: ( ",abuse,par,dir); + foralldir(l) printf("%d ",a->x[l]); + printf(") -> ( "); + foralldir(l) printf("%d ",n->x[l]); + printf("), parity %d -> %d\n",a->parity,n->parity); +} + + +void test_gather( lattice_struct *lat ) +{ + int i,d,k,j,n,off,dir,n_err,par,checkparity; + tst_struct *a; + msg_tag *tag[NDIM]; + + a = new_latfield( tst_struct ); + + /* ignore parity if blocked lattice - usually OK */ + checkparity = 1; + foralldir(d) if (current_blocking_level[d]) checkparity = 0; + + n_err = 0; + for (k=0; k<2; k++) { + for (par=EVEN; par<=EVENODD; par++) { + + forallsites(i) { + foralldir(d) a[i].x[d] = site[i].x[d]; + a[i].parity = site[i].parity; + } + + foralldir(d) { + if (k) dir = opp_dir(d); else dir = d; + tag[d] = start_get( a, dir, par ); + } + + foralldir(d) { + if (k) dir = opp_dir(d); else dir = d; + + wait_get(tag[d]); + + if (is_up_dir(dir)) off = 1; else off = lat->size[d] - 1; + + forparity(i,par) foralldir(j) { + n = nb(dir,i); + if (( j != d && a[n].x[j] != a[i].x[j]) || + ( j == d && a[n].x[j] != ((a[i].x[j] + off) % lat->size[d])) +#ifndef IGNORE_PARITY + || (( a[i].parity != opp_parity(a[n].parity)) && checkparity ) +#endif + ) { + if (n_err < 10) + gather_test_error("HALOO! Gather error",dir,a+i,a+n,par); + n_err ++; + } + } + } + } + } + + /* test scatter too - inverse. Sensible only for EVEN or ODD */ + /* can be up or down */ + for (dir=0; dirsize[d]); + else if (d == odir) + a[n].x[d] = ((site[i].x[d] - 1 + lat->size[d])%lat->size[d]); + else + a[n].x[d] = site[i].x[d]; + } + a[n].parity = opp_parity(site[i].parity); + } + + wait_put( start_put( a, dir, par ) ); + + forparity(i,opar) { + error = 0; +#ifndef IGNORE_PARITY + if (checkparity && a[i].parity != site[i].parity) error = 1; +#endif + foralldir(d) if (a[i].x[d] != site[i].x[d]) error = 1; + if (error) { + if (n_err < 10) gather_test_error("HALOO! Scatter error", + dir,a+i,a+nb(odir,i),par); + n_err ++; + } + } + } + } + + if (n_err > 0) halt(" Lattice layout error (BUG in com_mpi.c or layout.c)"); + else printf0(" Gather/Scatter tests passed\n"); + + free_latfield(a); +} + + +/****************************************************************/ + +char *copy_latfield_func( char *f, int siz ) +{ + char *t; + + t = new_latfield_size( siz ); + memcpy( t, f, (siz * node.latfield_size) ); + return( t ); +} + +/****************************************************************/ + +char *latfield_alloc(int size) +{ + char *t; + char f[150]; + + t = (char *)malloc(node.latfield_size * size + + GATHER_STATUS_SIZE ); + if (t == NULL) { + sprintf(f,"Could not allocate a latfield of %d chars",size); + halt(f); + } + +#ifdef MPI + gather_status_reset( t, size ); +#endif + return( t ); +} + + +char *memalloc(int n, int size) +{ + char *t; + char f[150]; + + t = (char *)malloc(n * size); + if (t == NULL) { + sprintf(f,"Memalloc: could not allocate %d x %d bytes",n,size); + halt(f); + } + return( t ); +} + + +void *halt(char *s) +{ + printf("*** %s\n",s); + terminate(0); + return((void *)NULL); +} + + +void time_stamp(char *msg) +{ + time_t time_stamp; + + if (this_node == 0) { + time(&time_stamp); + if (msg != NULL) printf("%s",msg); + printf("%s\n", ctime(&time_stamp)); + fflush(stdout); + } +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/mersenne.h b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/mersenne.h new file mode 100644 index 0000000000000000000000000000000000000000..e2aa28ed5410e3ba426e0b474d41f7b113ee1574 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/mersenne.h @@ -0,0 +1,14 @@ +/*************************************************************** + * mersenne.h + * for the inline version of the mersenne generator + */ + +#define MERSENNE_N 624 + +extern int mersenne_i; +extern double mersenne_array[MERSENNE_N]; + +#define mersenne() ( mersenne_i > 0 ? mersenne_array[--mersenne_i] : mersenne_generate(&mersenne_i) ) + +void seed_mersenne(long a); +double mersenne_generate(int *); diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/mersenne_inline.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/mersenne_inline.c new file mode 100644 index 0000000000000000000000000000000000000000..35c1e2fd05df642c7168b30014beb60c4ec9dd5e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/mersenne_inline.c @@ -0,0 +1,117 @@ +/* A C-program for MT19937: Real number version (1998/4/6) */ +/* genrand() generates one pseudorandom real number (double) */ +/* which is uniformly distributed on [0,1]-interval, for each */ +/* call. sgenrand(seed) set initial values to the working area */ +/* of 624 words. Before genrand(), sgenrand(seed) must be */ +/* called once. (seed is any 32-bit integer except for 0). */ +/* Integer generator is obtained by modifying two lines. */ +/* Coded by Takuji Nishimura, considering the suggestions by */ +/* Topher Cooper and Marc Rieffel in July-Aug. 1997. */ + +/* This library is free software; you can redistribute it and/or */ +/* modify it under the terms of the GNU Library General Public */ +/* License as published by the Free Software Foundation; either */ +/* version 2 of the License, or (at your option) any later */ +/* version. */ +/* This library is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. */ +/* See the GNU Library General Public License for more details. */ +/* You should have received a copy of the GNU Library General */ +/* Public License along with this library; if not, write to the */ +/* Free Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */ +/* 02111-1307 USA */ + +/* Copyright (C) 1997 Makoto Matsumoto and Takuji Nishimura. */ +/* When you use this, send an email to: matumoto@math.keio.ac.jp */ +/* with an appropriate reference to your work. */ + +/* REFERENCE */ +/* M. Matsumoto and T. Nishimura, */ +/* "Mersenne Twister: A 623-Dimensionally Equidistributed Uniform */ +/* Pseudo-Random Number Generator", */ +/* ACM Transactions on Modeling and Computer Simulation, */ +/* Vol. 8, No. 1, January 1998, pp 3--30. */ + +#include +#include + +/* Period parameters */ +#define N 624 +#define M 397 +#define MATRIX_A 0x9908b0df /* constant vector a */ +#define UPPER_MASK 0x80000000 /* most significant w-r bits */ +#define LOWER_MASK 0x7fffffff /* least significant r bits */ + +/* Tempering parameters */ +#define TEMPERING_MASK_B 0x9d2c5680 +#define TEMPERING_MASK_C 0xefc60000 +#define TEMPERING_SHIFT_U(y) (y >> 11) +#define TEMPERING_SHIFT_S(y) (y << 7) +#define TEMPERING_SHIFT_T(y) (y << 15) +#define TEMPERING_SHIFT_L(y) (y >> 18) + +static unsigned int mt[N]; /* the array for the state vector */ +int mersenne_i = -1; /* < 0 means mt[N] is not initialized */ +double mersenne_array[N]; + +/* initializing the array with a NONZERO seed */ +void +seed_mersenne(long seed) +{ + int mti; + mt[0]= seed & 0xffffffffUL; + for (mti=1; mti> 30)) + mti); + /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */ + /* In the previous versions, MSBs of the seed affect */ + /* only MSBs of the array mt[]. */ + /* 2002/01/09 modified by Makoto Matsumoto */ + mt[mti] &= 0xffffffffUL; + /* for >32 bit machines */ + } + mersenne_i = 0; +} + +double /* generating reals */ +/* unsigned int */ /* for integer generation */ +mersenne_generate(int *dummy) +{ + register unsigned int y; + register int kk; + static unsigned int mag01[2]={0x0, MATRIX_A}; + /* mag01[x] = x * MATRIX_A for x=0,1 */ + + if (mersenne_i < 0) { /* if sgenrand() has not been called, */ + printf("DUMMY: you did not seed the generator!\n"); + exit(0); + } + + /* generate N words at one time */ + + for (kk=0;kk> 1) ^ mag01[y & 0x1]; + } + for (;kk> 1) ^ mag01[y & 0x1]; + } + y = (mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK); + mt[N-1] = mt[M-1] ^ (y >> 1) ^ mag01[y & 0x1]; + + for (kk=0; kk mc_lim[ri]; ri++); + for ( ; p2 < mc_lim[ri]; ri--); + + /** calculate the corrected weight factor **/ + w = wr1[ri]*p2 + wr0[ri] - wr1[mc_i]*mc_par - wr0[mc_i]; + + nhitmc++; + if (exp((double)w) >= dran()) { + /* accept it */ + ahitmc += 1.0; + mc_par = p2; + radius[parity] = rad; /* save radius */ + mc_i = ri; + ok = 1; + } else { + ok = 0; + } + + if (is_mucacalc) multi_calc(); + + } /* this_node */ + + broadcast_field(&ok,sizeof(int)); + + if ( !ok ) { + j = 0; + forparity(i,parity) { + multi_field[i] = multi_buf[j++]; + } + } + g_sync(); + return ( ok ); +} + +/*************************************************** + * read in the multicanonical weight function + */ + +int readmulti() +{ + FILE *fil; + int i,j; + + if (this_node == 0) { + + if ((fil = fopen(weightname,"r")) == NULL) { + printf(" - Non-multicanonical run\n"); + is_mucacalc = is_multicanonical = 0; + + } else { + + is_multicanonical = 1; + + printf(" ***** Multicanonical - reading weight file\n"); + i = (fscanf(fil,"%d",&nweight) == 1); + + if (i) { + mc_lim = (double *)calloc(nweight+2,sizeof(double)); + wr0 = (double *)calloc(nweight+2,sizeof(double)); + wr1 = (double *)calloc(nweight+2,sizeof(double)); + wrp = (double *)calloc(nweight+2,sizeof(double)); + + for (j=1; j<=nweight && i; j++) + i = (fscanf(fil,"%lg %lg",&mc_lim[j],&wrp[j]) == 2); + } + if (!i) halt(" ** Read error in weight file"); + fclose(fil); + + mc_lim[0] = -1000; mc_lim[nweight+1] = 1000; + wrp[0] = wrp[1]; wrp[nweight+1] = wrp[nweight]; + + if ((fil = fopen(weightwrk,"r")) != NULL) { + is_mucacalc = 1; + + /* allocate balancing arrays */ + num_sweep = (double *)calloc(nweight+2,sizeof(double)); + + fscanf(fil,"%lg %lg %lg %d %d",&w_min,&w_max,&mc_delta,&tunnel,&last_up); + + printf(" ***** Weight function set up, range %g - %g\n",w_min,w_max); + printf(" ***** Starting with delta %g, tunnel %d\n",mc_delta,tunnel); + + fclose(fil); + } else is_mucacalc = 0; + + for (i=0; i 0) { + /* this is restart status */ + double t; + FILE *f; + + printf(" **** Reading MUCA-calculation files\n"); + + f = fopen(weightnew,"r"); + + fscanf(f,"%d\n",&j); + if (j != nweight) halt("Error in `weight.new'\n"); + for (i=1; i 0) num_sweep[id-1] += 3; + if (id > 1) num_sweep[id-2] += 1; + if (id < nweight-1) num_sweep[id+1] += 3; + if (id < nweight-2) num_sweep[id+2] += 1; + + nvisit = (nvisit + 1) % 8; + if (nvisit == 0) calcmulti(); +} + +/************************************************************* + * calculate muca-function again + */ + +void calcmulti() +{ + int i,j,idown,iup; + + idown = -1; + for (i=0; i mc_lim[i] ) { + if (idown < 0) idown = i; /* first index */ + wrp[i] -= (num_sweep[i] - num_sweep[idown]) * mc_delta/nweight; + iup = i; + } + else if ( w_max*lattice.volume <= mc_lim[i] ) wrp[i] = wrp[iup]; + } + + if ( last_up && num_sweep[idown] ) { + tunnel++; + last_up = 0; + } else if ( !last_up && num_sweep[iup] ) { + tunnel++; + last_up = 1; + } + + if (tunnel >= 2 ) { + mc_delta /= 4; + printf(" ** New multicanonical delta-par: %g\n",mc_delta); fflush(stdout); + tunnel = 0; + } + + for (i=0; i + +/* First, scan the file and find the value */ + +void scan_label_value(FILE *f,char *s,char *fmt,void *val) +{ + char *p,buf[200],line[200]; + + do { + if (fgets(line,198,f) == NULL) { + sprintf(buf," *** Error reading input element %s",s); + halt(buf); + } + for (p=line; *p == ' ' || *p == '\t'; p++) ; + } while (*p == '\n'); + + if (strncmp(p,s,strlen(s)) != 0) { + sprintf(buf," *** Input: should be '%s', does not match '%s'",s,p); + halt(buf); + } + p += strlen(s); + if (sscanf(p,fmt,val) != 1) { + sprintf(buf," *** Unable to get the value for input %s",s); + halt(buf); + } +} + + +double get_d(FILE *f,char *s,int bcast) +{ + double val; + + if (this_node == 0) { + scan_label_value(f,s," %lg",&val); + if (bcast >= 0) printf(" %-30s %g\n",s,val); + } else val = 0; + + if (bcast) broadcast_double( &val ); + return(val); +} + + +int get_i(FILE *f,char *s,int bcast) +{ + int val; + + if (this_node == 0) { + scan_label_value(f,s," %d",&val); + if (bcast >= 0) printf(" %-30s %d\n",s,val); + } else val = 0; + + if (bcast) broadcast_int( &val ); + return(val); +} + + +int get_s(FILE *f, char *s, char *target, int bcast) +{ + int len; + if (this_node == 0) { + scan_label_value(f,s," %s",target); + if (bcast >= 0) printf(" %-30s %s\n",s,target); + } + len = strlen(target); + + if (bcast) broadcast_field(target, len); + return(len); +} + +/* get one item from a list */ + +int get_item(FILE *f, char *s, char *items[],int n_items, int bcast) +{ + char label[200]; + int i; + + if (this_node == 0) { + scan_label_value(f,s," %s",label); + if (bcast >= 0) printf(" %-30s %s\n",s,label); + /* Find the matching string */ + for (i=0; i 1.0 ); + + /* make it su2 matrix */ + + rsq = 1.0/sqrt( rsq ); + r0 *= rsq; r1 *= rsq; r2 *= rsq; r3 *= rsq; + + m->e[0][0] = cmplx( r0, r3 ); + m->e[0][1] = cmplx( r2, r1 ); + m->e[1][0] = cmplx(-r2, r1 ); + m->e[1][1] = cmplx( r0,-r3 ); + +} + +/* Set su3 matrix to unit matrix + */ + +void su3_one(su3_matrix *r) +{ + register int i,j; + for (i=0; i<3; i++) for (j=0; j<3; j++) { + r->e[i][j].imag = 0; + if (i==j) r->e[i][j].real = 1; else r->e[i][j].real = 0; + } +} + + +/****************************************************************** + * a routine for generating a random su3 matrix. + */ + + +void random_su3P( su3_matrix *a, int n_random ) +{ + int i,ina,inb,index1,ii; + su2_cmat u; + /* su3_matrix m; + * complex t; + */ + + su3_one( a ); /* set the matrix first to unity */ + + for (i=0; i inb) { ii=ina; ina=inb; inb=ii;} + + su2_random( &u ); /* get a random su2 */ + + left_su2_hit_n( &u, ina, inb, a ); /* and hit the su3 matrix */ + + /* mat_mul_an( (*a), (*a), m ); + * t = trace_su3( &m ); + * printf("loop %d random: trace %g %g\n",i,t.real,t.imag); + */ + } + if (i % 4 == 0) reunit_su3( a ); /* keep it unitary */ + } +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/reunitarize.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/reunitarize.c new file mode 100644 index 0000000000000000000000000000000000000000..04cb00634d0443aebf5582c4409e8dcbda95a877 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/reunitarize.c @@ -0,0 +1,119 @@ +/*********************** reunitarize.c ***************************/ +/* MIMD version 3 */ + +/* reunitarize the link matrices */ +#include LATDEF + +/* canopy qcdlib code - stolen, of course */ +#define fixsu3(matrix) \ +{ \ + bj0r = (*matrix).e[0][0].real; \ + bj0i = (*matrix).e[0][0].imag; \ + bj1r = (*matrix).e[0][1].real; \ + bj1i = (*matrix).e[0][1].imag; \ + bj2r = (*matrix).e[0][2].real; \ + bj2i = (*matrix).e[0][2].imag; \ + ar = (*matrix).e[1][2].real; \ + ai = (*matrix).e[1][2].imag; \ + tr = bj1r*ar - bj1i*ai; \ + ti = bj1r*ai + bj1i*ar; \ + ar = (*matrix).e[1][1].real; \ + ai = (*matrix).e[1][1].imag; \ + tr = tr - bj2r*ar + bj2i*ai; \ + ti = ti - bj2r*ai - bj2i*ar; \ + (*matrix).e[2][0].real = tr; \ + (*matrix).e[2][0].imag = -ti; \ + ar = (*matrix).e[1][0].real; \ + ai = (*matrix).e[1][0].imag; \ + tr = bj2r*ar - bj2i*ai; \ + ti = bj2r*ai + bj2i*ar; \ + ar = (*matrix).e[1][2].real; \ + ai = (*matrix).e[1][2].imag; \ + tr = tr - bj0r*ar + bj0i*ai; \ + ti = ti - bj0r*ai - bj0i*ar; \ + (*matrix).e[2][1].real = tr; \ + (*matrix).e[2][1].imag = -ti; \ + ar = (*matrix).e[1][1].real; \ + ai = (*matrix).e[1][1].imag; \ + tr = bj0r*ar - bj0i*ai; \ + ti = bj0r*ai + bj0i*ar; \ + ar = (*matrix).e[1][0].real; \ + ai = (*matrix).e[1][0].imag; \ + tr = tr - bj1r*ar + bj1i*ai; \ + ti = ti - bj1r*ai - bj1i*ar; \ + (*matrix).e[2][2].real = tr; \ + (*matrix).e[2][2].imag = -ti; \ + } + +/* #pragma inline ( reunit_su3 ) */ + +void reunit_su3(su3_matrix *c) +{ + register float bj0r, bj0i, bj1r, bj1i, bj2r, bj2i; + register float ar, ai, tr, ti; + + /* first normalize row 0 */ + ar = (*c).e[0][0].real * (*c).e[0][0].real + /* sum of squares of row */ + (*c).e[0][0].imag * (*c).e[0][0].imag + + (*c).e[0][1].real * (*c).e[0][1].real + + (*c).e[0][1].imag * (*c).e[0][1].imag + + (*c).e[0][2].real * (*c).e[0][2].real + + (*c).e[0][2].imag * (*c).e[0][2].imag; + + ar = 1.0 / sqrt( (double)ar); /* used to normalize row */ + (*c).e[0][0].real *= ar; + (*c).e[0][0].imag *= ar; + (*c).e[0][1].real *= ar; + (*c).e[0][1].imag *= ar; + (*c).e[0][2].real *= ar; + (*c).e[0][2].imag *= ar; + + /* now make row 1 orthogonal to row 0 */ + ar = (*c).e[0][0].real * (*c).e[1][0].real + /* real part of 0 dot 1 */ + (*c).e[0][0].imag * (*c).e[1][0].imag + + (*c).e[0][1].real * (*c).e[1][1].real + + (*c).e[0][1].imag * (*c).e[1][1].imag + + (*c).e[0][2].real * (*c).e[1][2].real + + (*c).e[0][2].imag * (*c).e[1][2].imag; + ai = (*c).e[0][0].real * (*c).e[1][0].imag - /* imag part of 0 dot 1 */ + (*c).e[0][0].imag * (*c).e[1][0].real + + (*c).e[0][1].real * (*c).e[1][1].imag - + (*c).e[0][1].imag * (*c).e[1][1].real + + (*c).e[0][2].real * (*c).e[1][2].imag - + (*c).e[0][2].imag * (*c).e[1][2].real; + + /* row 2 -= a * row1 */ + (*c).e[1][0].real -= ar*(*c).e[0][0].real - ai*(*c).e[0][0].imag; + (*c).e[1][0].imag -= ar*(*c).e[0][0].imag + ai*(*c).e[0][0].real; + (*c).e[1][1].real -= ar*(*c).e[0][1].real - ai*(*c).e[0][1].imag; + (*c).e[1][1].imag -= ar*(*c).e[0][1].imag + ai*(*c).e[0][1].real; + (*c).e[1][2].real -= ar*(*c).e[0][2].real - ai*(*c).e[0][2].imag; + (*c).e[1][2].imag -= ar*(*c).e[0][2].imag + ai*(*c).e[0][2].real; + + /* now normalize row 1 */ + ar = (*c).e[1][0].real * (*c).e[1][0].real + /* sum of squares of row */ + (*c).e[1][0].imag * (*c).e[1][0].imag + + (*c).e[1][1].real * (*c).e[1][1].real + + (*c).e[1][1].imag * (*c).e[1][1].imag + + (*c).e[1][2].real * (*c).e[1][2].real + + (*c).e[1][2].imag * (*c).e[1][2].imag; + + ar = 1.0 / sqrt( (double)ar); /* used to normalize row */ + (*c).e[1][0].real *= ar; + (*c).e[1][0].imag *= ar; + (*c).e[1][1].real *= ar; + (*c).e[1][1].imag *= ar; + (*c).e[1][2].real *= ar; + (*c).e[1][2].imag *= ar; + + fixsu3(c); /* reconstruct row 2 */ + +} /* reunit_su3 */ + + +void reunitarize(su3_matrix *link[NDIM]) { + int i,dir; + + foralldir(dir) forallsites(i) reunit_su3( &link[dir][i] ); + +} /*reunitarize() */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/setup_basic.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/setup_basic.c new file mode 100644 index 0000000000000000000000000000000000000000..d219bc29d45b69f0d2fcfdee06014a9e7eba6d94 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/setup_basic.c @@ -0,0 +1,69 @@ +/******** setup_basic.c *********/ +/* MIMD code version 3 */ + +/* Here are basic setup routines, which do not depend on the + * program + */ + +#include LATDEF + +/* SETUP ROUTINES */ + + +void initial_setup() +{ + + /* First, adjust malloc so that glibc free() does not + * release space to the system + */ + +#ifdef __GNUC__ +#include "malloc.h" + mallopt( M_MMAP_MAX, 0 ); /* don't use mmap */ + /* HACK: don't release memory by calling sbrk */ + mallopt( M_TRIM_THRESHOLD, -1 ); +#endif + + /* Machine initialization first */ + initialize_machine(); + g_sync(); + + /* set the timing up */ + inittime(); + + /* basic static node variables */ + this_node = mynode(); + number_of_nodes = numnodes(); + +#ifdef __GNUC__ + printf0(" GNU c-library performance:\n using sbrk instead of mmap; not returning memory\n"); +#endif + +} + + +/************************************************** + * random number generators + */ + +void initialize_prn(long seed) +{ + int node; + + node = mynode(); + + if (seed == 0) { + if (this_node == 0) { + seed = time(NULL); + seed = seed^(seed<<26)^(seed<<9); + printf(" + Random seed from time %ld\n",seed); + } + broadcast_field(&seed,sizeof(long)); + } + seed += 1121*node; + seed = seed ^ ((532*node)<<18); + + seed_mersenne(seed); + +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/setup_files.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/setup_files.c new file mode 100644 index 0000000000000000000000000000000000000000..4f3b417abb500ae381a4a39c7e73350c63a625e7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/setup_files.c @@ -0,0 +1,82 @@ + +#include "comdefs.h" /* global variables for lattice fields */ + +int reposition(FILE *f,int nmeas); + +/************************************************** + * Set up the system for one run + */ + +FILE * setup_files(int restart, char *name, int nmeas, int mea, int *sync) +{ + e_header h; + FILE *f; + + if (this_node == 0) { + if (!restart) { + + h.headerid = E_HEADER_ID; + h.headersize = sizeof(e_header); + + h.lx = lattice.size[XUP]; h.ly = lattice.size[YUP]; + h.lz = lattice.size[ZUP]; h.lt = 1; +#if NDIM == 4 + h.lt = lattice.size[TUP]; +#endif + + h.n_float = nmeas; + h.n_long = h.n_double = h.n_char = 0; + + f = fopen(name,"w+"); + fwrite(&h,sizeof(e_header),1,f); + *sync = 0; + return(f); + } else { + + f = fopen(name,"r+"); + if (f == NULL) { + printf(" *** File %s does not exist?\n",name); + exit(0); + } + printf(" - Repositioning %s to position %d\n",name,mea); + + *sync = reposition(f,mea) + return( f ); + } + } /* this_node == 0 */ +} + +/************************************************** + * this routine repositions the measurement-files + */ + +int reposition(FILE *f,int nmeas) +{ + e_header h; + int length; + int l,j; + char *cbuf; + + fread(&h,sizeof(e_header),1,f); + j = 0; + length = h.n_double*sizeof(double) + + h.n_long*sizeof(long) + h.n_float*sizeof(float) +h.n_char*sizeof(char); + cbuf = (char *)malloc(length); + + while (j evenodd */ + + start=1; /* indicates staple sum not initialized */ + foralldir(dir2) if (dir2 != dir1) { + + odir = opp_dir(dir2); + + /* first, get link[dir2] from dir1 to all points */ + + /* get link[dir2] from direction dir1 */ + tag0 = start_get( link[dir2], dir1, EVENODD ); + + /* get link[dir1] from direction dir2 */ + tag1 = start_get( link[dir1], dir2, parity ); + + /* multiply link[dir2]^* link[dir1] link[dir2] at direction -dir2 */ + forparity_wait(i, otherparity, tag0) { + prefetch_MATRIX(&link[dir2][i+1]); + prefetch_MATRIX(&link[dir1][i+1]); + prefetch_MATRIX( &link[dir2][nb(dir1,i+1)] ); + + mult_MATRIX_an( link[dir2][i], link[dir1][i], tmat1 ); + mult_MATRIX_nn( tmat1, link[dir2][nb(dir1,i)], tmpmat[i] ); + } + + /* bottom staple ready, push up */ + tag2 = start_get( tmpmat, odir, parity ); + + wait_get(tag1); + /* just try to see what comes ..*/ + if(start){ /* this is the first contribution to staple */ + forparity(i,parity){ + prefetch_MATRIX( &link[dir2][nb(dir1,i)] ); + mult_MATRIX_nn( link[dir2][i], link[dir1][nb(dir2,i)], tmat1 ); + prefetch_MATRIX( &link[dir2][i+1] ); + prefetch_MATRIX( &link[dir1][nb(dir2,i+1)] ); + mult_MATRIX_na( tmat1, link[dir2][nb(dir1,i)], staple[i] ); + } + start=0; + } else { + forparity(i,parity){ + prefetch_MATRIX( &link[dir2][nb(dir1,i)] ); + mult_MATRIX_nn( link[dir2][i], link[dir1][nb(dir2,i)], tmat1 ); + prefetch_MATRIX( &link[dir2][i+1] ); + prefetch_MATRIX( &link[dir1][nb(dir2,i+1)] ); + mult_MATRIX_na( tmat1, link[dir2][nb(dir1,i)], tmat2 ); + add_MATRIX( staple[i], tmat2, staple[i] ); + } + } /* upper staple */ + + /* Lower staple */ + wait_get(tag2); + forparity(i,parity){ + prefetch_MATRIX( &staple[i+1] ); + prefetch_MATRIX( &tmpmat[nb(odir,i+1)]); + add_MATRIX( staple[i], tmpmat[nb(odir,i)], staple[i] ); + } /* lower staple */ + } + free_tmp( tmpmat ); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/staples_su2.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/staples_su2.c new file mode 100644 index 0000000000000000000000000000000000000000..7971f8f18e6dda24c28a8e7a9a8e90766a3a444d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/staples_su2.c @@ -0,0 +1,9 @@ +/****** staples_su3.c -- compute the staple ******************/ + +/* MIMD version 3 */ + +#include LATDEF +#include "generic_su2.h" + +#define staples_MATRIX staples_su2 +#include "staples_generic.c" diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/staples_su3.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/staples_su3.c new file mode 100644 index 0000000000000000000000000000000000000000..c239f3ba509037d2b67637f8cd647bd2851290a7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/staples_su3.c @@ -0,0 +1,9 @@ +/****** staples_su3.c -- compute the staple ******************/ + +/* MIMD version 3 */ + +#include LATDEF +#include "generic_su3.h" + +#define staples_MATRIX staples_su3 +#include "staples_generic.c" diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/su2.h b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/su2.h new file mode 100644 index 0000000000000000000000000000000000000000..671f52c9255f893b0cb722a20e6b5ed3dac698d6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/su2.h @@ -0,0 +1,234 @@ +/****************************** su2.h *********************************** + * * + * Define here the su2 + operators * + * MIMD version 3 * + * Kari Rummukainen 1997 * + */ + + +typedef struct { + radix_link a,b,c,d; +} su2_matrix; + +typedef struct { + radix a,b,c; +} adjoint; + +#ifdef T3E +void prefetch_su2(su2_matrix *); +#define prefetch_matrix(p) prefetch_su2((su2_matrix *)p) +#else +#define prefetch_matrix(par) +#define prefetch_su2(par) +#define prefetch_adjoint(par) +#endif + +#define nn_a(x,y) ( x.d*y.a + x.a*y.d - x.b*y.c + x.c*y.b) +#define nn_b(x,y) ( x.d*y.b + x.b*y.d - x.c*y.a + x.a*y.c) +#define nn_c(x,y) ( x.d*y.c + x.c*y.d - x.a*y.b + x.b*y.a) +#define nn_d(x,y) ( x.d*y.d - x.a*y.a - x.b*y.b - x.c*y.c) + +#define na_a(x,y) (-x.d*y.a + x.a*y.d + x.b*y.c - x.c*y.b) +#define na_b(x,y) (-x.d*y.b + x.b*y.d + x.c*y.a - x.a*y.c) +#define na_c(x,y) (-x.d*y.c + x.c*y.d + x.a*y.b - x.b*y.a) +#define na_d(x,y) ( x.d*y.d + x.a*y.a + x.b*y.b + x.c*y.c) + +#define an_a(x,y) ( x.d*y.a - x.a*y.d + x.b*y.c - x.c*y.b) +#define an_b(x,y) ( x.d*y.b - x.b*y.d + x.c*y.a - x.a*y.c) +#define an_c(x,y) ( x.d*y.c - x.c*y.d + x.a*y.b - x.b*y.a) +#define an_d(x,y) ( x.d*y.d + x.a*y.a + x.b*y.b + x.c*y.c) + +#define aa_a(x,y) (-x.d*y.a - x.a*y.d - x.b*y.c + x.c*y.b) +#define aa_b(x,y) (-x.d*y.b - x.b*y.d - x.c*y.a + x.a*y.c) +#define aa_c(x,y) (-x.d*y.c - x.c*y.d - x.a*y.b + x.b*y.a) +#define aa_d(x,y) ( x.d*y.d - x.a*y.a - x.b*y.b - x.c*y.c) + +#define mult_su2_nn(x,y,r) {\ +r.a = nn_a(x,y); r.b = nn_b(x,y); \ +r.c = nn_c(x,y); r.d = nn_d(x,y); } +#define mult_su2_na(x,y,r) {\ +r.a = na_a(x,y); r.b = na_b(x,y); \ +r.c = na_c(x,y); r.d = na_d(x,y); } +#define mult_su2_an(x,y,r) {\ +r.a = an_a(x,y); r.b = an_b(x,y); \ +r.c = an_c(x,y); r.d = an_d(x,y); } +#define mult_su2_aa(x,y,r) {\ +r.a = aa_a(x,y); r.b = aa_b(x,y);\ +r.c = aa_c(x,y); r.d = aa_d(x,y); } + +#define mult_su2_nn_a(x,y,r) {\ +r.a = nn_a(x,y); r.b = nn_b(x,y); r.c = nn_c(x,y); } +#define mult_su2_na_a(x,y,r) {\ +r.a = na_a(x,y); r.b = na_b(x,y); r.c = na_c(x,y); } +#define mult_su2_an_a(x,y,r) {\ +r.a = an_a(x,y); r.b = an_b(x,y); r.c = an_c(x,y); } +#define mult_su2_aa_a(x,y,r) {\ +r.a = aa_a(x,y); r.b = aa_b(x,y); r.c = aa_c(x,y); } + +#define add_su2_matrix(x,y,r) {\ +r.a = x.a + y.a; r.b = x.b + y.b; \ +r.c = x.c + y.c; r.d = x.d + y.d; } +#define sub_su2_matrix(x,y,r) {\ +r.a = x.a - y.a; r.b = x.b - y.b; \ +r.c = x.c - y.c; r.d = x.d - y.d; } +#define scalar_mult_sum_su2_matrix(x,s,r) {\ +r.a += (s)*x.a; r.b += (s)*x.b; \ +r.c += (s)*x.c; r.d += (s)*x.d; } +#define mult_su2_add(x,y,r) {\ +r.a += nn_a(x,y); r.b += nn_b(x,y); \ +r.c += nn_c(x,y); r.d += nn_d(x,y); } +#define su2_mul_inv_add(x,y,r) {\ +r.a -= nn_a(x,y); r.b -= nn_b(x,y); \ +r.c -= nn_c(x,y); r.d += nn_d(x,y); } + +#define su2_sqr(x) (x.a*x.a + x.b*x.b + x.c*x.c + x.d*x.d) +#define su2_det(x) su2_sqr(x) +#define su2_dot(x,y) (x.d*y.d - x.a*y.a - x.b*y.b - x.c*y.c) +#define su2_rdot(x,y) (x.d*y.d + x.a*y.a + x.b*y.b + x.c*y.c) +#define su2_tr(x) (2.0*x.d) +#define su2_tr2(x) x.d +#define su2_inv(x,r) { r.a=-x.a; r.b=-x.b; r.c=-x.c; r.d= x.d; } +#define su2_inv1(x) { x.a=-x.a; x.b=-x.b; x.c=-x.c; } +#define su2_cpy(x,r) r = x +#define su2_scalar_mul(x,s,r) {\ +r.a = (s)*x.a; r.b = (s)*x.b; r.c = (s)*x.c; r.d = (s)*x.d; } +#define su2_scalar_mul_add(x,s,r) {\ +r.a += (s)*x.a; r.b += (s)*x.b; r.c += (s)*x.c; r.d += (s)*x.d; } +#define su2_scalar_mul_inv_add(x,s,r) {\ +r.a -= (s)*x.a; r.b -= (s)*x.b; r.c -= (s)*x.c; r.d += (s)*x.d; } +#define su2_scalar_mul_sub(x,s,r) {\ +r.a -= (s)*x.a; r.b -= (s)*x.b; r.c -= (s)*x.c; r.d -= (s)*x.d; } +#define su2_add(x,r) { r.a += x.a; r.b += x.b; r.c += x.c; r.d += x.d; } +#define su2_add_inv(x,r) { r.a -= x.a; r.b -= x.b; r.c -= x.c; r.d += x.d; } +#define su2_zero(x) x.a = x.b = x.c = x.d = 0.0 +#define su2_one(x) { x.a = x.b = x.c = 0.0; x.d = 1.0; } +#define su2_scalar(s,x) { x.a = x.b = x.c = 0.0; x.d = s; } +#define su2_scalar_add(s,r) { r.d += s; } + +#define mult_su2_vec(m,v,t) mult_su2_nn(m,v,t) +#define mult_su2_vec_sum(m,v,t) mult_su2_add(m,v,t) +#define mult_adj_su2_vec(m,v,t) mult_su2_an(m,v,t) +#define add_su2_vector(x,y,t) add_su2_matrix(x,y,t) +#define scalar_mult_vec(a,s,t) su2_scalar_mul(a,s,t) +#define scalar_mult_add_vec(a,s,t) su2_scalar_mul_add(a,s,t) + +#define mult_su2_nadj(u,t,r) {\ +r.a = u.d*t.a - u.b*t.c + u.c*t.b; \ +r.b = u.d*t.b - u.c*t.a + u.a*t.c; \ +r.c = u.d*t.c - u.a*t.b + u.b*t.a; \ +r.d = - u.c*t.c - u.b*t.b - u.a*t.a; } +#define mult_su2_aadj(u,t,r) {\ +r.a = u.d*t.a + u.b*t.c - u.c*t.b; \ +r.b = u.d*t.b + u.c*t.a - u.a*t.c; \ +r.c = u.d*t.c + u.a*t.b - u.b*t.a; \ +r.d = u.c*t.c + u.b*t.b + u.a*t.a; } +#define mult_su2_adjn(t,u,r) {\ +r.a = u.d*t.a + u.b*t.c - u.c*t.b; \ +r.b = u.d*t.b + u.c*t.a - u.a*t.c; \ +r.c = u.d*t.c + u.a*t.b - u.b*t.a; \ +r.d = - u.c*t.c - u.b*t.b - u.a*t.a; } +#define mult_su2_adja(t,u,r) {\ +r.a = u.d*t.a - u.b*t.c + u.c*t.b; \ +r.b = u.d*t.b - u.c*t.a + u.a*t.c; \ +r.c = u.d*t.c - u.a*t.b + u.b*t.a; \ +r.d = u.c*t.c + u.b*t.b + u.a*t.a; } + +#define project_to_adjoint(u,s) { s.a = u.a; s.b = u.b; s.c = u.c; } +#define adjoint_to_matrix(s,u) { \ + u.d = 0; u.a = s.a; u.b = s.b; u.c = s.c; } + +#define adj_scalar(x,s) x.a = x.b = x.c = (s) +#define adj_sqr(x) (x.a*x.a + x.b*x.b + x.c*x.c) +#define adj_scalar_mul(x,s,r) {r.a = (s)*x.a; r.b = (s)*x.b; r.c = (s)*x.c;} +#define adj_scalar_mul_add(x,s,r) {\ +r.a += (s)*x.a; r.b += (s)*x.b; r.c += (s)*x.c;} +#define adj_scalar_mul_sub(x,s,r) {\ +r.a -= (s)*x.a; r.b -= (s)*x.b; r.c -= (s)*x.c;} +#define add_adjoint(u,t,r) {\ +r.a = u.a + t.a; r.b = u.b + t.b; r.c = u.c + t.c; } +#define sub_adjoint(u,t,r) {\ +r.a = u.a - t.a; r.b = u.b - t.b; r.c = u.c - t.c; } +#define adj_add(t,r) {r.a += t.a; r.b += t.b; r.c += t.c; } +#define adj_sub(t,r) {r.a -= t.a; r.b -= t.b; r.c -= t.c; } +#define adj_zero(x) x.a = x.b = x.c = 0.0 +#define adj_dot(x,y) (x.a*y.a + x.b*y.b + x.c*y.c) +#define adj_cpy(y,x) { x.a = y.a; x.b = y.b; x.c = y.c; } +#define adj_2scalar_mul(x,s,y,t,r) \ +{r.a = (s)*x.a + (t)*y.a; r.b = (s)*x.b + (t)*y.b; r.c = (s)*x.c + (t)*y.c;} + +#define trans_adj_up(u,e,r) { register radix t1,t2,t3; \ + t1 = 2.0*u.d; \ + t3 = t1*u.d - 1.0; \ + t2 = 2.0*(e.a*u.a + e.b*u.b + e.c*u.c); \ + r.a = e.a*t3 + u.a*t2 - t1*(e.b*u.c - e.c*u.b); \ + r.b = e.b*t3 + u.b*t2 - t1*(e.c*u.a - e.a*u.c); \ + r.c = e.c*t3 + u.c*t2 - t1*(e.a*u.b - e.b*u.a);} + +#define trans_adj_down(u,e,r) { register radix t1,t2,t3; \ + t1 = 2.0*u.d; \ + t3 = t1*u.d - 1.0; \ + t2 = 2.0*(e.a*u.a + e.b*u.b + e.c*u.c); \ + r.a = e.a*t3 + u.a*t2 + t1*(e.b*u.c - e.c*u.b); \ + r.b = e.b*t3 + u.b*t2 + t1*(e.c*u.a - e.a*u.c); \ + r.c = e.c*t3 + u.c*t2 + t1*(e.a*u.b - e.b*u.a);} + +/* exp of a matrix: exp(i E) = cos(|E|) + i E/|E| sin(|E|) + */ +#define su2_exp(x,u) { register radix r_t,s_t; \ + r_t = sqrt((double)adj_sqr(x)); \ + if (r_t>0) \ + { s_t = sin((double)r_t)/r_t; adj_scalar_mul(x,s_t,u); \ + u.d = cos((double)r_t); } \ + else su2_one(u); } + +/* log of a matrix: exp(i E) = U -> E = -i log[ U ] + * exp(i E) = cos(|E|) + i s_a E_a/|E| sin(|E|) = U_0 + i s_a U_a + * limit |E| to the interval [0,pi) + */ +#define su2_log(u,x) { register double s_q; register radix r_t; \ + s_q = adj_sqr(u); \ + if (s_q > 0 && u.d < 1.0) { \ + r_t = acos((double)u.d) / sqrt(s_q); adj_scalar_mul(u,r_t,x); } \ + else adj_zero(x); } + +#define su2_normalize(u,v) { register radix r_t; \ + r_t = 1.0/sqrt( (double)su2_sqr(u) ); su2_scalar_mul( u, r_t, v ); } + +/* gaussian_adjoint returns adjoint r with = w^2/2 + * so that if want p~exp( -d r^2 ), use w = 1/sqrt(d) + */ +#define gaussian_adjoint(r,w) { \ + r.a = (w) * gaussian_ran(); \ + r.b = (w) * gaussian_ran(); \ + r.c = (w) * gaussian_ran(); } + +#define metro_su2h( h, scale, r ) { \ +r.a = h.a + scale * (dran() - 0.5); \ +r.b = h.b + scale * (dran() - 0.5); \ +r.c = h.c + scale * (dran() - 0.5); \ +r.d = h.d + scale * (dran() - 0.5); } + +#define su2_R_I( h1, h2, R, I ) { R = an_d(h1,h2); I = -an_c(h1,h2); } +#define su2_isigma3( x, r ) { r.a = -x.b; r.b = x.a; r.c = x.d; r.d = -x.c; } +#define tr2_isigma3(x) (-x.c) + +/* this gives a gaussian vector of width = w^2/2, + * so that if we want distribution exp( -a g^2 ) + * we have to use w = 1/sqrt(a) + */ +#define gaussian_su2_vector( g, t ) { \ + /* double t = 1/sqrt((double)w); */ \ + g.a = t*gaussian_ran(); \ + g.b = t*gaussian_ran(); \ + g.c = t*gaussian_ran(); \ + g.d = t*gaussian_ran(); \ +} + +#define random_su2(u) { register radix t; \ + u.a = dran()-0.5; \ + u.b = dran()-0.5; \ + u.c = dran()-0.5; \ + u.d = dran()-0.5; \ + su2_normalize( u, u ); \ +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/timecheck.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/timecheck.c new file mode 100644 index 0000000000000000000000000000000000000000..f02407b94112bb4243c4c569d9be8fa4d00a6cf6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/timecheck.c @@ -0,0 +1,143 @@ +/************************************************** + * time checking routines + */ + +#include LATDEF + + +/************************************************* + * resource clocks + */ + +static double time_last; + + +#ifdef MPI + +static double time_start; + +void inittime() +{ + time_start = time_last = MPI_Wtime(); +} + +void resettime() +{ + time_last = MPI_Wtime(); +} + +double cputime() +{ + return ( MPI_Wtime() - time_start ); +} + +double added_cpu_time() +{ + double t,t2; + + t = (t2 = MPI_Wtime()) - time_last; + time_last = t2; + return(t); +} + +#else + +double cputime() +{ + struct rusage resource; + extern int getrusage(); + + getrusage(RUSAGE_SELF,&resource); + return(resource.ru_utime.tv_sec + 1e-6*resource.ru_utime.tv_usec + + resource.ru_stime.tv_sec + 1e-6*resource.ru_stime.tv_usec); + +} + +void inittime() +{ + time_last = cputime(); +} + +void resettime() +{ + time_last = cputime(); +} + +double added_cpu_time() +{ + double t,t2; + + t = (t2 = cputime()) - time_last; + time_last = t2; + return(t); +} + +#endif + +/************************************************** + * Timing check routines + */ + +static int interval=0,starttime; +static time_t timelimit; + + +void timecheck(int iteration, int maxiter, int status) +{ + int temp,ttime=0; + + if (this_node == 0) { + ttime = time(NULL); + if (interval <= 0.0) { + interval = ttime - starttime; + printf(" -- approx %d seconds between time checks\n",interval); + fflush(stdout); + interval += 1200; /* leave good time (20 min) for the save etc. */ + } + if (iteration < maxiter && timelimit - ttime - interval < 0) temp = 1; + else temp = 0; + } else temp = 0; + + broadcast_field(&temp,sizeof(int)); + + if (temp == 1) { + dumpall(status,&maxiter); + /* normal exit here */ + if (this_node == 0) + printf("\n **** cpu time exit, remaining time %d seconds\n", + (int)(timelimit - ttime)); + finishrun(); + } +} + +void inittimecheck() +{ + starttime = time(NULL); + interval = 0; + if (this_node == 0) { + printf(" -- Available wallclock time %d seconds\n",(int)(timelimit-starttime)); + fflush(stdout); + } +} + + +int setup_timelimit(time_t t,int argc,char *argv) +{ + int tmp,istimelimit; + + if (this_node == 0) { + if (argc > 0) { + if (sscanf(argv,"%d",&tmp) != 1) + halt("Error reading in time limit"); + timelimit = tmp; /* use tmp to guarantee int */ + if (this_node == 0) + printf(" +++++ wallclock time limit %d seconds\n",(int)timelimit); + istimelimit = 1; + timelimit += t; /* this is the time at the end ... */ + } else istimelimit = 0; + } + + broadcast_int( &istimelimit ); + + return(istimelimit); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/timers.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/timers.c new file mode 100644 index 0000000000000000000000000000000000000000..ec56a1dc961fcf4ce902063bc0c2b246c1911f2f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/timers.c @@ -0,0 +1,65 @@ +#ifdef TIMERS + +#include +#include +#include +#include +#include "comdefs.h" +#include "timers.h" + +double timer_start( timer_type * t ) +{ + struct timeval resource; + + if (this_node == 0) { + gettimeofday(&resource,NULL); + t->start = resource.tv_sec + 1.0e-6*resource.tv_usec; + + /* t->start = clock()*1.0/(CLOCKS_PER_SEC); */ + return(t->start); + } else return(0.0); +} + +double timer_end( timer_type * t ) +{ + double e; + struct timeval resource; + + if (this_node == 0) { + gettimeofday(&resource,NULL); + e = resource.tv_sec + 1.0e-6*resource.tv_usec; + /* e = clock()*1.0/(CLOCKS_PER_SEC); */ + + t->total += (e - t->start); + t->count++; + return(e); + } else return(0.0); +} + +void timer_report( timer_type * t ) +{ + struct timeval resource; + + if (this_node == 0) { + gettimeofday(&resource,NULL); + /* time used during the counter activity */ + t->initial = resource.tv_sec + 1.0e-6*resource.tv_usec - t->initial; + if (t->count) + printf(" total %g sec, %d calls, %g usec/call, fraction %.2g of time\n", + t->total, t->count, 1e6 * t->total/t->count, t->total/t->initial ); + else + printf(" no timed calls made\n"); + } +} + +void timer_reset( timer_type * t ) { + struct timeval resource; + + t->total = t->count = t->start = 0; + if (this_node == 0) { + gettimeofday(&resource,NULL); + t->initial = resource.tv_sec + 1.0e-6*resource.tv_usec; + } +} + +#endif /* timers */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/timers.h b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/timers.h new file mode 100644 index 0000000000000000000000000000000000000000..ce7cae3e46d571f1b05724905fcfeaad0252a0e5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/timers.h @@ -0,0 +1,22 @@ + +#ifdef TIMERS + +typedef struct { + double total, start, initial; /* cumulated, work, and initial timeval */ + int count; +} timer_type; + + +double timer_start( timer_type * ); +double timer_end( timer_type * ); +void timer_reset( timer_type * t ); +void timer_report( timer_type * ); + +#else + +#define timer_start(a) +#define timer_end(a) +#define timer_reset(a) +#define timer_report(a) + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/Make_template b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/Make_template new file mode 100644 index 0000000000000000000000000000000000000000..a855f7b31fa7fd0aed655a99061f898a82a09cd3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/Make_template @@ -0,0 +1,47 @@ +# Makefile for Libraries for QCD programs +# +# This template file defines rules and macros common to all architectures +# It is intended to be an include file for other Makefiles. +# Don't use it by itself! + +.c.o: + $(MPI_CC) $(CFLAGS) -c $*.c + +all: complex.a su3.a + +COMPLEXOBJS = cadd.o cdiv.o ce_itheta.o cexp.o clog.o cmplx.o cmul.o \ + conjg.o csqrt.o csub.o dcadd.o dcdiv.o dce_itheta.o \ + dcexp.o dclog.o dcmplx.o dcmul.o dconjg.o dcsqrt.o dcsub.o +$(COMPLEXOBJS) : complex.h +complex.a: complex.h $(COMPLEXOBJS) + $(AR) $(ARFLAGS) complex.a $(COMPLEXOBJS) + +SU3OBJS = addmat.o addvec.o cmp_ahmat.o cs_m_a_vec.o cs_m_a_mat.o cs_m_s_vec.o \ + cs_m_vec.o det_su3.o clear_mat.o dumpmat.o dumpvec.o clearvec.o gaussrand.o \ + m_amatvec_s.o m_amatvec.o m_amatvec_ns.o m_mat_an.o \ + m_mat_na.o m_mat_nn.o m_matvec.o m_matvec_ns.o m_matvec_s.o \ + make_ahmat.o rand_ahmat.o realtr.o complextr.o \ + s_m_a_mat.o s_m_a_vec.o s_m_s_mat.o s_m_s_vec.o s_m_sum_vec.o \ + s_m_vec.o s_m_mat.o cs_m_mat.o cs_m_s_mat.o \ + su3_adjoint.o su3_dot.o su3_rdot.o su3_proj.o su3mat_copy.o \ + su3vec_copy.o \ + submat.o subvec.o trace_su3.o uncmp_ahmat.o \ + msq_su3vec.o sub4vecs.o m_amv_4dir.o m_amv_4dir_2.o m_mv_s_4dir.o \ + flush_to_zero.o +#WILSON_OBJS = wp_shrink.o wp_grow.o wp_grow_a.o dump_wvec.o clear_wvec.o \ +# su3_proj_w.o copy_wvec.o add_wvec.o sub_wvec.o s_m_wvec.o \ +# s_m_hwvec.o msq_wvec.o wvec_dot.o wvec2_dot.o wvec_rdot.o \ +# s_m_a_wvec.o s_m_atm_wvec.o mb_gamma.o mb_gamma_l.o mb_gamma_r.o \ +# cs_m_a_wvec.o cs_m_a_wvec2.o \ +# m_mat_wvec.o m_mat_hwvec.o m_amat_wvec.o m_amat_hwvec.o \ +# grow4wvecs.o wp_shrink4.o +$(SU3OBJS) : complex.h su3.h +#$(WILSON_OBJS) : complex.h su3.h +su3.a:: su3.h $(SU3OBJS) + $(AR) $(ARFLAGS) su3.a $(SU3OBJS) +#su3.a:: su3.h $(WILSON_OBJS) +# $(AR) $(ARFLAGS) rcs su3.a $(WILSON_OBJS) + +clean: + $(RM) *.o + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/Make_vanilla b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/Make_vanilla new file mode 100644 index 0000000000000000000000000000000000000000..5070b2e2a808b7248e5d77d70ca8fc5fe37f72a7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/Make_vanilla @@ -0,0 +1,32 @@ +# Makefile for Libraries for QCD programs +# +# Library routines involve no communication, so are compiled +# as for a scalar processor. + +# +# This Makefile builds the purely C coded versions and should work +# for all architectures with a suitable choice of CFLAGS and +# CC below + +# The FAST option uses "fast" variants of the code, where available, and is +# recommended. The fast variants are C-coded with explicit loop unrolling +# and inlining. + +#CFLAGS = -O3 -DFAST -Wall -Wmissing-prototypes #gnu c compiler +#CFLAGS = -O -DFAST -float #Dec alpha compiler +#CFLAGS = -O -f -DFAST #Mips +CFLAGS = -O3 -DFAST -DNATIVEDOUBLE #IBM RS6000 (optimized) +#CFLAGS = -g -Wall -DFAST -DNATIVEDOUBLE #IBM RS6000 (profile/debug) +#CFLAGS = -DSGI -O3 -mips4 -64 -OPT:IEEE_arithmetic=3:roundoff=3:alias=restrict -TENV:X=1 -DFAST -DNATIVEDOUBLE # SGI Origin 2000 (from UCSB) +#CFLAGS = -O3 -mips4 -r10000 -OPT:IEEE_arithmetic=3:roundoff=3:alias=restrict -TENV:X=1 -DFAST -DNATIVEDOUBLE #NCSA SGI PC (untested for MILC version 5) +#CFLAGS= -O5 -dalign -libmil -fsimple=2 -fns #SUN Ultra +#CFLAGS = -O -DFAST -float sp_ops -float sp_const -noautopar #Convex Exemplar (untested for MILC version 5) + + +CC = cc #most +#CC = xlc #IBM RS6000 ANSI C +#CC = gcc #gnu c compiler + +include Make_template + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..146791ff96b24704168ca9dcce895fd36ebc6958 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/Makefile @@ -0,0 +1,31 @@ +# Makefile for Libraries for QCD programs +# +# Library routines involve no communication, so are compiled +# as for a scalar processor. + +# The specialized Makefiles are for processors for which we +# have some assembly coded substitutes. +# +# This Makefile builds the purely C coded versions and should work +# for all architectures with a suitable choice of CFLAGS and +# MPI_CC. + +# The FAST option uses "fast" variants of the code, where available, and is +# recommended. The fast variants are C-coded with explicit loop unrolling +# and inlining. + +include ../Makefile.defs + +CFLAGS += -DFAST + +#CFLAGS += -DFAST #gnu c compiler +#CFLAGS += -O -DFAST -float #Dec alpha compiler +#CFLAGS += -O -f -DFAST #Mips +#CFLAGS += -O3 -DFAST -DNATIVEDOUBLE #IBM RS6000 (optimized) +#CFLAGS += -g -Wall -DFAST -DNATIVEDOUBLE #IBM RS6000 (debug/profile) +#CFLAGS += -DSGI -O3 -mips4 -64 -OPT:IEEE_arithmetic=3:roundoff=3:alias=restrict -TENV:X=1 -DFAST -DNATIVEDOUBLE # SGI Origin 2000 (from UCSB) +#CFLAGS += -O3 -mips4 -r10000 -OPT:IEEE_arithmetic=3:roundoff=3:alias=restrict -TENV:X=1 -DFAST -DNATIVEDOUBLE #NCSA SGI PC (untested for MILC version 5) +#CFLAGS += -O5 -dalign -libmil -fsimple=2 -fns #SUN Ultra +#CFLAGS += -O -DFAST -float sp_ops -float sp_const -noautopar #Convex Exemplar (untested for MILC version 5) + +include Make_template diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/add_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/add_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..4c9e5cc4bd72c484b4cebe82cca0cfeddd7207ea --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/add_wvec.c @@ -0,0 +1,14 @@ +/******************** add_wvec.c (in su3.a) ******************** +* +*void add_wilson_vector(wilson_vector *src1,*src2,*dest) +* add two Wilson vectors +* dest <- src1 + src2 +*/ +#include "complex.h" +#include "su3.h" + +void add_wilson_vector( wilson_vector *src1, wilson_vector *src2, + wilson_vector *dest ){ + register int i; + for(i=0;i<4;i++)add_su3_vector( &(src1->d[i]), &(src2->d[i]), &(dest->d[i])); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/addmat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/addmat.c new file mode 100644 index 0000000000000000000000000000000000000000..9a493a809e67b84619c582ac8b86bd09109f013f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/addmat.c @@ -0,0 +1,13 @@ +/******************** addmat.c (in su3.a) ***************************** +* * +* Add two SU3 matrices * +*/ +#include "complex.h" +#include "su3.h" + +void add_su3_matrix( su3_matrix *a, su3_matrix *b, su3_matrix *c ) { +register int i,j; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + CADD( a->e[i][j], b->e[i][j], c->e[i][j] ); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/addvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/addvec.c new file mode 100644 index 0000000000000000000000000000000000000000..b5d07e363483dbe2f90f30f4303e144890325607 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/addvec.c @@ -0,0 +1,13 @@ +/******************** addvec.c (in su3.a) ***************************** +* * +* Add two SU3 vectors * +*/ +#include "complex.h" +#include "su3.h" + +void add_su3_vector( su3_vector *a, su3_vector *b, su3_vector *c ){ +register int i; + for(i=0;i<3;i++){ + CADD( a->c[i], b->c[i], c->c[i] ); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/asdef.alpha.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/asdef.alpha.h new file mode 100644 index 0000000000000000000000000000000000000000..aaf841a898ff967017bf80a6af87147d73e6fb78 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/asdef.alpha.h @@ -0,0 +1,65 @@ +#define v0 $0 /*Integer return value register*/ +#define t0 $1 /*Integer scratch registers (caller saved)*/ +#define t1 $2 +#define t2 $3 +#define t3 $4 +#define t4 $5 +#define t5 $6 +#define t6 $7 +#define t7 $8 +#define s0 $9 /*Integer save registers (callee saved)*/ +#define s1 $10 +#define s2 $11 +#define s3 $12 +#define s4 $13 +#define s5 $14 +#define fp $15 /*Private frame pointer register*/ +#define a0 $16 /*Integer argument registers*/ +#define a1 $17 +#define a2 $18 +#define a3 $19 +#define a4 $20 +#define a5 $21 +#define t8 $22 /*Scratch registers (continued)*/ +#define t9 $23 +#define t10 $24 +#define t11 $25 +#define ra $26 /*Return address register*/ +#define t12 $27 /*Scratch registers (continued)*/ +#define at $28 #reserved for assembler +#define gp $29 /*global pointer*/ +#define sp $30 /*Stack pointer register*/ +#define zero $31 /*Integer ReadAsZero/Sink register*/ + +#define fv0 $f0 /*Floating-point return value register*/ +#define fv1 $f1 +#define fs0 $f2 /*Floating-point save registers (callee saved)*/ +#define fs1 $f3 +#define fs2 $f4 +#define fs3 $f5 +#define fs4 $f6 +#define fs5 $f7 +#define fs6 $f8 +#define fs7 $f9 +#define ft0 $f10 /*Floating-point scratch registers*/ +#define ft1 $f11 +#define ft2 $f12 +#define ft3 $f13 +#define ft4 $f14 +#define ft5 $f15 +#define fa0 $f16 /*Floating-point argument registers*/ +#define fa1 $f17 +#define fa2 $f18 +#define fa3 $f19 +#define fa4 $f20 +#define fa5 $f21 +#define ft6 $f22 /*Floating-point scratch registers (continued)*/ +#define ft7 $f23 +#define ft8 $f24 +#define ft9 $f25 +#define ft10 $f26 +#define ft11 $f27 +#define ft12 $f28 +#define ft13 $f29 +#define ft14 $f30 +#define fzero $f31 /*Floating-point ReadAsZero/Sink register*/ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cadd.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cadd.c new file mode 100644 index 0000000000000000000000000000000000000000..8e6b5e8f7c0d4b6957c1c67d7f50bcdd8fc88d78 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cadd.c @@ -0,0 +1,10 @@ +/* Subroutines for operations on complex numbers */ +/* add two complex numbers */ +#include "complex.h" + +complex cadd( complex *a, complex *b ) { + complex c; + c.real = (*a).real + (*b).real; + c.imag = (*a).imag + (*b).imag; + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cdiv.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cdiv.c new file mode 100644 index 0000000000000000000000000000000000000000..a8d59a0e72db0711d267af6d2bdb1829b77bbdb2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cdiv.c @@ -0,0 +1,12 @@ +/* Subroutines for operations on complex numbers */ +/* Divide two complex numbers */ +#include "complex.h" + +complex cdiv( complex *a, complex *b ) { + complex c; + radix scale; + scale = 1.0/((*b).real*(*b).real+(*b).imag*(*b).imag); + c.real = scale*((*a).real*(*b).real + (*a).imag*(*b).imag); + c.imag = scale*((*a).imag*(*b).real - (*a).real*(*b).imag); + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/ce_itheta.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/ce_itheta.c new file mode 100644 index 0000000000000000000000000000000000000000..7eced480a1847556b95eacdc6ac4a7e3c372acfe --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/ce_itheta.c @@ -0,0 +1,12 @@ +/* Subroutines for operations on complex numbers */ +/* exp( i*theta ) */ +#include +#include "complex.h" + +complex ce_itheta( radix theta ){ + complex c; + c.real = (radix)cos( (double)theta ); + c.imag = (radix)sin( (double)theta ); + /* there must be a more efficient way */ + return( c ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cexp.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cexp.c new file mode 100644 index 0000000000000000000000000000000000000000..501be0be8747ca04df3a66eeebe02b7ac887d63f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cexp.c @@ -0,0 +1,13 @@ +/* Subroutines for operations on complex numbers */ +/* complex exponential */ +#include +#include "complex.h" + +complex cexp( complex *a ){ + complex c; + radix mag; + mag = (radix)exp( (double)(*a).real ); + c.real = mag*(radix)cos( (double)(*a).imag ); + c.imag = mag*(radix)sin( (double)(*a).imag ); + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clear_mat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clear_mat.c new file mode 100644 index 0000000000000000000000000000000000000000..33a4df85e0b07e2406a20b702abd6ecd6110d2a7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clear_mat.c @@ -0,0 +1,15 @@ +/******************** clear_mat.c (in su3.a) ******************** +* +*void clear_su3mat( su3_matrix *dest ) +* clear an SU3 matrix +* dest <- zero_matrix +*/ +#include "complex.h" +#include "su3.h" + +void clear_su3mat( su3_matrix *dest ){ +register int i,j; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + dest->e[i][j].real = dest->e[i][j].imag = 0.0; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clear_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clear_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..65fbed79f52f5283d4e501c7a14e215a2f53ab48 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clear_wvec.c @@ -0,0 +1,15 @@ +/******************** clear_wvec.c (in su3.a) ******************** +* +*void clear_wilson_vector( wilson_vector *dest ) +* clear a Wilson vector +* dest <- zero_vector +*/ +#include "complex.h" +#include "su3.h" + +void clear_wvec( wilson_vector *dest ){ +register int i,j; + for(i=0;i<4;i++)for(j=0;j<3;j++){ + dest->d[i].c[j].real = dest->d[i].c[j].imag = 0.0; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clearvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clearvec.c new file mode 100644 index 0000000000000000000000000000000000000000..4072c9254ab726a553d1215b692a39acc527ee1d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clearvec.c @@ -0,0 +1,14 @@ +/******************* clearvec.c (in su3.a) ***************************** +* * +* void clearvec( su3_vector *vec ) * +* print out a 3 element complex vector * +*/ +#include "complex.h" +#include "su3.h" + +void clearvec( su3_vector *v ) +{ + v->c[0].real = v->c[0].imag = 0.0; + v->c[1].real = v->c[1].imag = 0.0; + v->c[2].real = v->c[2].imag = 0.0; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clog.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clog.c new file mode 100644 index 0000000000000000000000000000000000000000..deead044aec545268ea79bf598236820e5fdaefb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clog.c @@ -0,0 +1,11 @@ +/* Subroutines for operations on complex numbers */ +/* complex logarithm */ +#include +#include "complex.h" + +complex clog( complex *a ){ + complex c; + c.real = 0.5*(radix)log((double)((*a).real*(*a).real+(*a).imag*(*a).imag)); + c.imag = (radix)atan2( (double)(*a).imag, (double)(*a).real ); + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cmp_ahmat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cmp_ahmat.c new file mode 100644 index 0000000000000000000000000000000000000000..1cf75efb2091fc2596f8a859e246d11add32cda8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cmp_ahmat.c @@ -0,0 +1,20 @@ +/***************** cmp_ahmat.c (in su3.a) ***************************** +* * +* Make an anti_hermitmat (anti Hermitian matrix in compressed form) * +* from an SU3 matrix (3x3 complex matrix). * +*/ +#include "complex.h" +#include "su3.h" + +void compress_anti_hermitian( su3_matrix *mat_su3, + anti_hermitmat *mat_antihermit ) { + mat_antihermit->m00im=mat_su3->e[0][0].imag; + mat_antihermit->m11im=mat_su3->e[1][1].imag; + mat_antihermit->m22im=mat_su3->e[2][2].imag; + mat_antihermit->m01.real=mat_su3->e[0][1].real; + mat_antihermit->m02.real=mat_su3->e[0][2].real; + mat_antihermit->m12.real=mat_su3->e[1][2].real; + mat_antihermit->m01.imag=mat_su3->e[0][1].imag; + mat_antihermit->m02.imag=mat_su3->e[0][2].imag; + mat_antihermit->m12.imag=mat_su3->e[1][2].imag; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cmplx.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cmplx.c new file mode 100644 index 0000000000000000000000000000000000000000..d7a513beaef9acad3cd0fadf4d2d28dd50db4232 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cmplx.c @@ -0,0 +1,9 @@ +/* Subroutines for operations on complex numbers */ +/* make a complex number from two real numbers */ +#include "complex.h" + +complex cmplx( radix x, radix y ) { + complex c; + c.real = x; c.imag = y; + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cmul.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cmul.c new file mode 100644 index 0000000000000000000000000000000000000000..8b5b3a2f0f6ec0877200cd414bf8785dc901fa10 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cmul.c @@ -0,0 +1,10 @@ +/* Subroutines for operations on complex numbers */ +/* multiply two complex numbers */ +#include "complex.h" + +complex cmul( complex *a, complex *b ) { + complex c; + c.real = (*a).real * (*b).real - (*a).imag * (*b).imag; + c.imag = (*a).imag * (*b).real + (*a).real * (*b).imag; + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/comdefs.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/comdefs.h new file mode 100644 index 0000000000000000000000000000000000000000..2d8dff178841f98b702a0e473e554679d0e2a154 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/comdefs.h @@ -0,0 +1,410 @@ +/************************* comdefs.h *************************************/ + +/* Definitions for communications for the SU3 program on the Intel machine, + version 4. + + Communications routines will assume that the lattice is stored as an + array of structures of type "site". + + MODIFIED FOR 3D CODE Kari Rummukainen 1997 +*/ + +/* message types (not all are used on all machines) */ +#define PARAM_TYPE 11 /* type of parameter message to nodes */ +#define FIELD_TYPE 12 /* type of field sent from one node to another */ +#define BROADCAST_FLOAT_TYPE 13 /* broadcast of radixing point number */ +#define BROADCAST_DOUBLE_TYPE 14 /* broadcast of double */ +#define BROADCAST_COMPLEX_TYPE 15 /* broadcast of single precision complex */ +#define BROADCAST_DCOMPLEX_TYPE 16 /* broadcast of double precision complex */ +#define SEND_INTEGER_TYPE 17 /* send an integer to one other node */ +#define CM_GATHER_TYPE 18 /* type for CM5 (cooperative) messages */ +#define SYNC_TYPE 50 /* Synchronize all nodes */ +#define SUM_FLOAT_TYPE 51 /* Sum radix over all nodes */ +#define SUM_DOUBLE_TYPE 52 /* Sum double over all nodes */ +#define SUM_COMPLEX_TYPE 53 /* Sum complex over all nodes */ +#define SUM_DCOMPLEX_TYPE 54 /* Sum double_complex over all nodes */ +#define MAX_FLOAT_TYPE 55 /* Maximum radix over all nodes */ +#define MAX_DOUBLE_TYPE 56 /* Maximum double over all nodes */ + +/* For MPI, need to include mpi.h to define MPI_Request */ +/* For MPI on paragon, mynode() and numnodes() can't replace nx routines */ +#ifdef MPI +#include +#define mynode MILC_mynode +#define numnodes MILC_numnodes +#define dclock MILC_dclock +#endif + +/* Added for pvm */ +#ifdef PVM +#define ANY_MSG -1 /* Any message */ + +/* Message structures for communication between host and nodes */ +/* The first two fields must always be the same for each type */ +/* The basic structure must be the shortest */ + +/* For most messages */ +struct hcs_basic { + int msg_type; + int node; /* integer identifies caller's instance */ + int arg1,arg2,arg3; /* Use depends on which routine */ +} ; +#define HCS_BASIC_SIZE (sizeof(struct hcs_basic)) +#ifdef PROTO +int put_hcs_basic( struct hcs_basic * hcs ); +int get_hcs_basic( struct hcs_basic * hcs ); +#endif + +/* For printf and scanf calls */ +#define STRING_LENGTH 256 +struct hcs_stdio { + int msg_type; + int node; /* integer identifies caller's instance */ + int length; + char s[STRING_LENGTH]; +} ; +#define HCS_STDIO_SIZE (sizeof(struct hcs_stdio)) +#ifdef PROTO +int put_hcs_stdio( struct hcs_stdio * hcs ); +int get_hcs_stdio( struct hcs_stdio * hcs ); +#endif + +/* For initialization call */ +#define MAX_NUMBER_NODES 8 +#define HOST_NAME_LENGTH 128 +struct hcs_ident { + int msg_type; + int node; /* integer identifies caller's instance */ + int your_node; /* logical node number for this node */ + int number_nodes; /* how many nodes for this partition */ + int node_instance[MAX_NUMBER_NODES]; /* instance number for logical node */ + char host_name[HOST_NAME_LENGTH]; /* name of host */ +} ; +#define HCS_IDENT_SIZE (sizeof(struct hcs_ident)) +#ifdef PROTO +int put_hcs_ident( struct hcs_ident * hcs ); +int get_hcs_ident( struct hcs_ident * hcs ); +#endif + +union { + struct hcs_basic basic; + struct hcs_stdio stdio; + struct hcs_ident ident; +} hcs; + +#define HOST_CALL 77 /* pvm message type for call to host for service */ +#define HOST_REPLY 87 + +/* Message subtypes internal to host-node service calls */ +/* (Not used by pvm to identify messages) */ +#define PRINTF_HOST_CALL 11 +#define SCANF_HOST_CALL 12 +#define FPRINTF_HOST_CALL 13 /* Not used */ +#define FSCANF_HOST_CALL 14 /* Not used */ +#define FFLUSH_HOST_CALL 20 +#define FOPEN_HOST_CALL 30 /* Not used */ +#define FCLOSE_HOST_CALL 31 /* Not used */ +#define OPEN_HOST_CALL 32 /* Not used */ +#define CLOSE_HOST_CALL 33 /* Not used */ +#define CREAT_HOST_CALL 34 /* Not used */ +#define READ_HOST_CALL 40 /* Not used */ +#define WRITE_HOST_CALL 41 /* Not used */ +#define NODE_IDENT_CALL 78 +#define NODES_DONE_HOST_CALL 99 + +/* end of pvm additions */ +#endif /* end ifdef PVM */ + +/* Added for pvm */ +#ifdef PVM24 +#define ANY_MSG -1 /* Any message */ + +/* Message structures for communication between host and nodes */ +/* The first two fields must always be the same for each type */ +/* The basic structure must be the shortest */ + +/* For initialization call */ +#define MAX_NUMBER_NODES 8 +#define HOST_NAME_LENGTH 128 +struct hcs_ident_struct { + int msg_type; + int node; /* integer identifies caller's instance */ + int your_node; /* logical node number for this node */ + int number_nodes; /* how many nodes for this partition */ + int node_instance[MAX_NUMBER_NODES]; /* instance number for logical node */ + char host_name[HOST_NAME_LENGTH]; /* name of host */ +} hcs_ident ; +#define HCS_IDENT_SIZE (sizeof(struct hcs_ident_struct)) +#ifdef PROTO +int put_hcs_ident( struct hcs_ident_struct * hcs ); +int get_hcs_ident( struct hcs_ident_struct * hcs ); +#endif + +#define NODE_IDENT_CALL 77 +#define terminate g_terminate /* Because of name conflict */ + +/* end of pvm version 2.4 additions */ +#endif /* end ifdef PVM24 */ + + +/* Added for pvm */ +#ifdef PVM3 +#define ANY_MSG -1 /* Any message */ +#define ANY_NODE -1 /* Any node */ + +/* Message structures for communication between host and nodes */ + +/* For initialization call */ +#define MAX_NUMBER_NODES 8 +#define HOST_NAME_LENGTH 128 +struct hcs_ident_struct { + int msg_type; + int node; /* integer identifies caller's instance */ + int your_node; /* logical node number for this node */ + int number_nodes; /* how many nodes for this partition */ + int node_tid[MAX_NUMBER_NODES]; /* instance number for logical node */ + char host_name[HOST_NAME_LENGTH]; /* name of host */ +} hcs_ident ; +#define HCS_IDENT_SIZE (sizeof(struct hcs_ident_struct)) +#ifdef PROTO +int put_hcs_ident( struct hcs_ident_struct * hcs ); +int get_hcs_ident( struct hcs_ident_struct * hcs ); +#endif + +#define NODE_IDENT_CALL 77 + +/* end of pvm version 3 additions */ +#endif /* end ifdef PVM3 */ + +/* Added for MPL */ +#ifdef MPL +#define MPL_NOT_COMPLETED -1 /* For mpc_status */ +#define MPL_INACTIVE -2 /* For mpc_status */ +#define DONTCARE -1 /* Any message or any node */ +int mperrno; /* Used for error reporting */ +/* The following are from /usr/lpp/poe/include/mpproto.h */ +extern int mpc_environ(int *howmany,int *whoami); +extern int mpc_stopall(int errcode); +extern int mpc_group(int gsize,int glist[],int label,int *gid); +extern int mpc_send(char *sarr,int len,int dest,int type,int *msgid); +extern int mpc_recv(char *darr,int len,int *src,int *type,int *msgid); +extern int mpc_bsend(char *sarr,int len,int dest,int type); +extern int mpc_brecv(char *darr,int len,int *src,int *type,int *nbytes); +extern int mpc_status(int msgid); +extern int mpc_wait(int *msgid,int *nbytes); +extern int mpc_sync(int gid); + +extern void s_vadd(radix in1[],radix in2[],radix out[],int *len); +extern void d_vadd(double in1[],double in2[],double out[],int *len); +extern void s_vmax(radix in1[],radix in2[],radix out[],int *len); +extern void d_vmax(double in1[],double in2[],double out[],int *len); +#define MAX_NUMBER_NODES 64 +#define HOST_NAME_LENGTH 128 + +/* end of MPL additions */ +#endif /* end ifdef MPL */ + +#define FIELD_REQUEST 100 /* used by field_pointer...() */ +#define FIELD_REPLY 101 /* used by field_pointer...() */ + +#define N_S_GATHERS 5 /* max number of concurrent sparse gathers */ +#define SPARSE_GATHER_BASE 1000 +#define SPARSE_GATHER_STEP 1000 + +#define GENERAL_GATHER_BASE 6000 /* types from this to this+number_of_nodes + are used by the general_gather routines */ +#define GATHER_BASE 7000 /* types greater than or equal to this are used + by the gather routines */ +/* pid */ +#define NODE_PID 0 +#define HOST_PID 0 +#define ALL_NODES -1 /* works for Ncube, Intel */ /* Don't use with pvm */ + +/* definitions of restore and save lattice commands */ +#define CONTINUE 10 +#define FRESH 11 +#define RELOAD_ASCII 12 +#define RELOAD_BINARY 13 +#define RELOAD_CHECKPOINT 14 +#define FORGET 20 +#define SAVE_ASCII 21 +#define SAVE_BINARY 22 +#define SAVE_CHECKPOINT 23 + +/* Directions, and a macro to give the opposite direction */ +/* These must go from 0 to 5 because they will be used to index an + array. */ +/* Also define NDIRS = number of directions */ +#define XUP 0 +#define YUP 1 +#define ZUP 2 +#define ZDOWN 3 +#define YDOWN 4 +#define XDOWN 5 + +#define OPP_DIR(dir) (5-(dir)) /* Opposite direction */ +#define NDIRS 6 /* number of directions */ + +#define MAX_GATHERS 24 /* Maximum number of gather tables */ + +/* arguments to the make_gather() routine */ +#define FORWARDS 1 +#define BACKWARDS (-1) /* BACKWARDS = -FORWARDS */ +#define OWN_INVERSE 0 +#define WANT_INVERSE 1 +#define NO_INVERSE 2 +#define ALLOW_EVEN_ODD 0 +#define NO_EVEN_ODD 1 +#define SAME_PARITY 0 +#define SWITCH_PARITY 1 +#define SCRAMBLE_PARITY 2 + +/* "comlink" is the basic structure used in gathering neighboring sites. + Each node will maintain one such structure for each direction for each + (other) node that contains sites that are neighbors of the sites on + this node. For example, if the XUP neighbors of sites on this node + are found on two other nodes, then this node will maintain a linked + list of two comlink structures for gathering from the XUP direction. +*/ +struct comlink { + /* pointer to next in list, NULL if this is last */ + struct comlink *nextcomlink; + /* number of the node to which we connect */ + int othernode; + /* number of even sites on this node that have neighbors on + other node connected by this "comlink", and same for odd + sites on this node. */ + int n_even_connected, n_odd_connected; + /* Address of list of indices of even sites (on this node) + whose neighbors are found through this comlink, same for odd. + The odd list follows the even list, so to get all sites you + start at esitelist and take n_even_connected+n_odd_connected + addresses. */ + /* When the comlink is describing sites to be sent, the "odd" + list lists sites whose neighbors are even. This convention + is natural for the nearest neighbor gathers. For gathers + which don't allow even and odd site gathers, the even list + is used for list of sites to be received and the odd + list for sites to be sent. Different comlink structures + may point to the same list. For example, the receive list + for one gather may be a send list for the opposite gather. */ + int *esitelist, *ositelist; +}; +typedef struct comlink comlink; + + +/* Structure to keep track of outstanding sends and receives */ +typedef struct { + /* node sending or receiving message */ + int msg_node; + /* size of message in bytes */ + int msg_size; + /* address of buffer malloc'd for message */ + char *msg_buf; + /* message id returned by system call */ +#ifdef MPI + MPI_Request msg_id; +#else + int msg_id; +#endif +#if defined(PVM) || defined(PVM24) || defined(PVM3) || defined(MPL) + int msg_OK; + /* flag to track the asynchronous arrival of messages */ +#endif +#ifdef MPL + int mpl_msgid; + /* MPL assigned message id for checking status with mpc_status */ +#endif +} msg_tag; + +/* Structure for requesting a field from another node */ +typedef struct { + int field; /* offset of field in site */ + int size; /* size of field */ + int index; /* index of field on other node */ +} msg_request; + + +/* Communications routines */ +void send_parameters(params *); +void get_parameters(params *); +void send_field(void *,int,int); +void get_field(void *,int); +char * machine_type(); +#ifndef PARAGON +int mynode(); +int numnodes(); +#endif +void g_sync(); +void g_radixsum(radix *); +void g_vecradixsum(radix *,int); +void g_doublesum(double *); +void g_vecdoublesum(double *,int); +void g_radixmax(radix *); +void g_doublemax(double *); +void g_complexsum(complex *); +void g_veccomplexsum(complex *,int); +void g_dcomplexsum(double_complex *); +void broadcast_field(void *p,int siz); +void broadcast_radix(radix *); +void broadcast_double(double *); +void broadcast_complex(complex *); +void broadcast_dcomplex(double_complex *); +void send_integer(int node,int *address); +void receive_integer(int *); +double dclock(); +int terminate(); int finishrun(); + +msg_tag *start_gather(field_offset,int size,int index,int parity,char **dest); +void wait_gather(msg_tag *mbuf); +void cleanup_gather(msg_tag *mbuf); +msg_tag *start_general_gather(field_offset f,int size,int *displacement, + int parity,char **dest); +void wait_general_gather(msg_tag *m); +void cleanup_general_gather(msg_tag *m); +char *field_pointer_at_coordinates( field_offset f, int size, + int x,int y,int z); +char *field_pointer_at_direction( field_offset f,int size, site *s, + int direction ); +void cleanup_field_pointer(char *b); + +msg_tag *start_sparse_gather(field_offset f,int size,int *displacement, + int xb, int yb ,char **dest); +void wait_sparse_gather(msg_tag *m); +void cleanup_sparse_gather(msg_tag *m); + + +/* Each node maintains a list of headers to lists of comlinks */ +/**EXTERN comlink * neighborlist[NDIRS];**/ +/* addresses of neighboring sites, NULL if off-node */ +/**EXTERN site ** neighbor[NDIRS];**/ + +#ifdef NONSENSE +#ifdef PE_CODE +#include +/* On CM5, redefine system calls */ +#define main CMPE_control +#define printf host_printf +#define scanf host_scanf +#define fprintf host_fprintf +#define fscanf host_fscanf +#define fflush host_fflush +#define fopen host_fopen +#define fclose host_fclose +#define open host_open +#define close host_close +#define creat host_creat +#define read host_read +#define write host_write +#endif +#endif + +#ifdef PVM +#ifndef HOST_CODE +#define terminate g_terminate /* Because of name conflict */ +#define scanf host_scanf +#define printf host_printf +#define fflush host_fflush +#endif +#endif + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/complex.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/complex.h new file mode 100644 index 0000000000000000000000000000000000000000..2bb95437d8c9d8627a3a183257916e5fe90427af --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/complex.h @@ -0,0 +1,200 @@ +/*============================================================================*/ +/* */ +/* Complex Numbers */ +/* */ +/* Typedefs are included for type complex (single-precision) and type */ +/* double_complex (double_precision) complex numbers. At this time, the */ +/* functions cannot be overloaded, so there are separate routines for the */ +/* single and double precision types. All of the macros, however, will work */ +/* with both types and mix types freely. */ +/* */ +/* The following functions are provided in single and double precision: */ +/* */ +/* complex cmplx(radix r, radix i); (r,i) */ +/* complex cadd(complex *a, complex *b); *a + *b */ +/* complex cmul(complex *a, complex *b); *a * *b */ +/* complex csub(complex *a, complex *b); *a - *b */ +/* complex cdiv(complex *a, complex *b); *a / *b */ +/* complex conjg(complex *a); conjugate of *a */ +/* complex cexp(complex *a); exp(*a) */ +/* complex clog(complex *a); ln(*a) */ +/* complex csqrt(complex *a); sqrt(a) */ +/* complex ce_itheta(radix theta); exp(i*theta) */ +/* */ +/* The following macros are provided, which work for BOTH single and double */ +/* precision and for mixtures: */ +/* */ +/* 1) Macros which appear to return values (radix or double, as appropriate): */ +/* cabs(*a) magnitude of the complex number *a */ +/* cabs_sq(*a) square of the magnitude (faster than cabs) */ +/* carg(*a) phase of the complex number *a */ +/* CABS(a) returns |a| */ +/* CABS_SQ(a) returns |a|^2 */ +/* CARG(a) returns phase of a */ +/* CRDOT(a,b) returns real part of conjg(a)*b */ +/* */ +/* 2) Macro to convert from single to double or double to single: */ +/* set_complex_equal(*a,*b) do *b=*a by components to convert */ +/* SET_COMPLEX(a,b,c) do c = a + ib */ +/* */ +/* 3) Macros for fast in-line operations: */ +/* CZERO(a) a = 0 */ +/* CONJG(a,b) b = conjg(a) */ +/* CADD(a,b,c) c = a + b */ +/* CSUM(a,b) a += b */ +/* CSUB(a,b,c) c = a - b */ +/* CMUL(a,b,c) c = a * b */ +/* CMUL_ADD(a,b,c) c += a * b = CMUL_SUM */ +/* CINV(a,b) b = 1 / a */ +/* CDIV(a,b,c) c = a / b */ +/* CMUL_J(a,b,c) c = a * conjg(b) _ADD */ +/* CMULJ_(a,b,c) c = conjg(a) * b _ADD */ +/* CMULJJ(a,b,c) c = conjg(a*b) _ADD */ +/* CNEGATE(a,b) b = -a */ +/* CMUL_I(a,b) b = ia */ +/* CMUL_MINUS_I(a,b) b = -ia */ +/* CMULREAL(a,b,c) c = ba with b real and a complex */ +/* CDIVREAL(a,b,c) c = a/b with a complex and b real */ +/* */ +/*============================================================================*/ + +/* Get in radix definition */ +#include "radix.h" + +/* On the paragon, under OSF, complex is defined in math.h, but not + quite the way we did it, so redefine it: +*/ + +#if ( defined PARAGON || defined HPUX ) +#define complex complexx +#endif + +/* The T3E UNICOS standard library has cexp, clog, and csqrt, but they + are for double-precision complex, while ours are single-precision */ +#ifdef T3E +#define cexp cexp_single +#define clog clog_single +#define csqrt csqrt_single +#endif + +/* The above bites us actually now with C99, so redefine it all here: */ +#define complex complexx +#define clog clog_single +#define cexp cexp_single +#define csqrt csqrt_single + +typedef struct { /* standard complex number declaration for single- */ + radix real; /* precision complex numbers */ + radix imag; +} complex; +typedef struct { /* standard complex number declaration for double- */ + double real; /* precision complex numbers */ + double imag; +} double_complex; + +/* define complex as a union to ensure alignment to doubleword boundary */ +/*typedef union { ** standard complex number declaration for single- ** + radix f[2]; ** precision complex numbers ** + double dummy; +} complex; +typedef struct { ** standard complex number declaration for double- ** + double f[2]; ** precision complex numbers ** +} double_complex; */ +/*#define real f[0] */ +/*#define imag f[1] */ + + +/* Function Prototypes for Complex Numbers */ +complex cmplx( radix x, radix y ); +complex cadd( complex *a, complex *b ); +complex cmul( complex *a, complex *b ); +complex csub( complex *a, complex *b ); +complex cdiv( complex *a, complex *b ); +complex conjg( complex *a ); +complex cexp( complex *a ); +complex clog( complex *a ); +complex csqrt( complex *z ); +complex ce_itheta( radix theta ); + +double_complex dcmplx( double x, double y ); +double_complex dcadd( double_complex *a, double_complex *b ); +double_complex dcmul( double_complex *a, double_complex *b ); +double_complex dcsub( double_complex *a, double_complex *b ); +double_complex dcdiv( double_complex *a, double_complex *b ); +double_complex dconjg( double_complex *a ); +double_complex dcexp( double_complex *a ); +double_complex dclog( double_complex *a ); +double_complex dcsqrt( double_complex *z ); +double_complex dce_itheta( double theta ); + +/* Macros for Complex Numbers */ + +#define CZERO(a) (a).real = (a).imag = ((radix)0.0) + /* *b = *a */ +#define set_complex_equal(a,b) { (*b).real=(*a).real; (*b).imag=(*a).imag; } + +#define SET_COMPLEX(a,b,c) { (c).real = a; (c).imag = b; } + /* |*a| */ +#define cabs(a) (sqrt( (*a).real*(*a).real + (*a).imag*(*a).imag ) ) +#define CABS(a) (sqrt( (a).real * (a).real + (a).imag * (a).imag ) ) + /* *a * *a* */ +#define dcabs cabs +#define cabs_sq(a) ( (*a).real*(*a).real + (*a).imag*(*a).imag ) +#define CABS_SQ(a) ( (a).real * (a).real + (a).imag * (a).imag ) + /* phase(*a) */ +#define carg(a) (atan2((double)(*a).imag, (double)(*a).real ) ) +#define CARG(a) (atan2((double)(a).imag, (double)(a).real ) ) +/* real of conjg(a)*b */ +#define CRDOT(a,b) ((a).real * (b).real + (a).imag * (b).imag) + /* b = a* */ +#define dcarg carg +#define CONJG(a,b) { (b).real = (a).real; (b).imag = -(a).imag; } + /* c = a + b */ +#define CADD(a,b,c) { (c).real = (a).real + (b).real; \ + (c).imag = (a).imag + (b).imag; } + /* a += b */ +#define CSUM(a,b) { (a).real += (b).real; (a).imag += (b).imag; } + /* c = a - b */ +#define CSUB(a,b,c) { (c).real = (a).real - (b).real; \ + (c).imag = (a).imag - (b).imag; } + /* c = a * b */ +#define CMUL(a,b,c) { (c).real = (a).real*(b).real - (a).imag*(b).imag; \ + (c).imag = (a).real*(b).imag + (a).imag*(b).real; } + /* c = a / b */ +#define CMUL_SUM(a,b,c) { (c).real += (a).real*(b).real - (a).imag*(b).imag; \ + (c).imag += (a).real*(b).imag + (a).imag*(b).real; } +#define CMUL_ADD CMUL_SUM + /* c += a / b */ +#define CDIV(a,b,c) { double t_t = (b).real*(b).real + (b).imag*(b).imag; \ + (c).real = ((a).real*(b).real + (a).imag*(b).imag)/t_t; \ + (c).imag = ((a).imag*(b).real - (a).real*(b).imag)/t_t; } + +#define CINV(a,b) { double t_t = (a).real*(a).real + (a).imag*(a).imag; \ + (b).real = (a).real/t_t; (b).imag = -(a).imag/t_t; } + /* c = a * b* */ +#define CMUL_J(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \ + (c).imag = (a).imag*(b).real - (a).real*(b).imag; } + /* c = a * b* */ +#define CMUL_J_ADD(a,b,c) { (c).real += (a).real*(b).real + (a).imag*(b).imag; \ + (c).imag += (a).imag*(b).real - (a).real*(b).imag; } + /* c = a* * b */ +#define CMULJ_(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \ + (c).imag = (a).real*(b).imag - (a).imag*(b).real; } + /* c = a* * b */ +#define CMULJ__ADD(a,b,c) { (c).real += (a).real*(b).real + (a).imag*(b).imag; \ + (c).imag += (a).real*(b).imag - (a).imag*(b).real; } + /* c = (a*b)* */ +#define CMULJJ(a,b,c) { (c).real = (a).real*(b).real - (a).imag*(b).imag; \ + (c).imag = -(a).real*(b).imag - (a).imag*(b).real; } +#define CMULJJ_ADD(a,b,c) { (c).real += (a).real*(b).real - (a).imag*(b).imag; \ + (c).imag += -(a).real*(b).imag - (a).imag*(b).real; } + /* b = - a */ +#define CNEGATE(a,b) { (b).real = -(a).real; (b).imag = -(a).imag; } + /* b = ia */ +#define CMUL_I(a,b) { (b).real = -(a).imag; (b).imag = (a).real; } + /* b = -ia */ +#define CMUL_MINUS_I(a,b) { (b).real = (a).imag; (b).imag = -(a).real; } + /* c = ba */ +#define CMULREAL(a,b,c) { (c).real = (b) * (a).real; (c).imag = (b)*(a).imag; } + /* c = a/b */ +#define CDIVREAL(a,b,c) { (c).real = (a).real/(b); (c).imag = (a).imag/(b); } diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/complextr.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/complextr.c new file mode 100644 index 0000000000000000000000000000000000000000..0afc2915f4ec50b087612a358ea39e80077ec471 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/complextr.c @@ -0,0 +1,19 @@ +/****************** complextr.c (in su3.a) **************************** +* * +* complex complextrace_su3( su3_matrix *a,*b) * +* return Tr( A_adjoint*B ) * +*/ +#include "complex.h" +#include "su3.h" + +complex complextrace_su3( su3_matrix *a, su3_matrix *b ) { +register int i,j; +register radix sumr, sumi; +complex sum; + for(sumr=0.0,sumi=0.0,i=0;i<3;i++)for(j=0;j<3;j++){ + sumr+= a->e[i][j].real*b->e[i][j].real + a->e[i][j].imag*b->e[i][j].imag; + sumi+= a->e[i][j].real*b->e[i][j].imag - a->e[i][j].imag*b->e[i][j].real; + } + sum.real= sumr; sum.imag=sumi; + return(sum); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/conjg.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/conjg.c new file mode 100644 index 0000000000000000000000000000000000000000..a34198589be3d7badbdee98b30e504fefd1509e4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/conjg.c @@ -0,0 +1,10 @@ +/* Subroutines for operations on complex numbers */ +/* complex conjugate */ +#include "complex.h" + +complex conjg( complex *a ){ + complex c; + c.real = (*a).real; + c.imag = -(*a).imag; + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/copy_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/copy_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..305eef040c4e6d6e7781ce9baaa9a7b8517757bd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/copy_wvec.c @@ -0,0 +1,12 @@ +/******************** copy_wvec.c (in su3.a) ******************** +* +*void copy_wvec( wilson_vector *src,*dest ) +* copy a Wilson vector +* dest <- src +*/ +#include "complex.h" +#include "su3.h" + +void copy_wvec( wilson_vector *src, wilson_vector *dest ){ + *dest = *src; /* hardly worth a function */ +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_mat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_mat.c new file mode 100644 index 0000000000000000000000000000000000000000..0f93c396e8e4eb85c3bae24b0bbc7fe7f077e04b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_mat.c @@ -0,0 +1,38 @@ +/****************** cs_m_a_mat.c (in su3.a) *************************** +* * +* c_scalar_mult_add_su3mat( su3_matrix *ma, su3_matrix *m2, * +* complex *phase, su3_matrix *m3) * +* multiply an su3 matrix by a complex scalar and add it to another * +* matrix: m3 <- m1 + number*m2 * +*/ +#include "complex.h" +#include "su3.h" + +void c_scalar_mult_add_su3mat( su3_matrix *m1, su3_matrix *m2, + complex *phase, su3_matrix *m3){ + +#ifndef NATIVEDOUBLE +register int i,j; +complex t; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + t = cmul(&m2->e[i][j],phase); + m3->e[i][j] = cadd(&m1->e[i][j],&t); + } + +#else +register int i,j; +register double sr,si,br,bi,cr,ci; + + sr = (*phase).real; si = (*phase).imag; + + for(i=0;i<3;i++)for(j=0;j<3;j++){ + br=m2->e[i][j].real; bi=m2->e[i][j].imag; + + cr = sr*br - si*bi; + ci = sr*bi + si*br; + + m3->e[i][j].real = m1->e[i][j].real + cr; + m3->e[i][j].imag = m1->e[i][j].imag + ci; + } +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_vec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_vec.c new file mode 100644 index 0000000000000000000000000000000000000000..d0e36aaa6827fd6096c96e2ea5d73951a4d8a4e5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_vec.c @@ -0,0 +1,35 @@ +/****************** cs_m_a_vec.c (in su3.a) *************************** +* * +* c_scalar_mult_add_su3vec(): * +* multiply an su3 vector by a complex scalar and add it to another * +* vector: v1 <- v1 + number*v2 * +*/ +#include "complex.h" +#include "su3.h" + +void c_scalar_mult_add_su3vec( su3_vector *v1, complex *phase, su3_vector *v2 ){ + +#ifndef NATIVEDOUBLE +register int i; +complex t; + for(i=0;i<3;i++){ + t = cmul(&v2->c[i],phase); + v1->c[i] = cadd(&v1->c[i],&t); + } +#else +register int i; +register double sr,si,br,bi,cr,ci; + + sr = (*phase).real; si = (*phase).imag; + + for(i=0;i<3;i++){ + br=v2->c[i].real; bi=v2->c[i].imag; + + cr = sr*br - si*bi; + ci = sr*bi + si*br; + + v1->c[i].real += cr; + v1->c[i].imag += ci; + } +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..ccc21d61b9e2503a7799e8999b3087e0c813fd87 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_wvec.c @@ -0,0 +1,42 @@ +/******************** cs_m_a_wvec.c (in su3.a) ******************** +* +*void c_scalar_mult_add_wvec(wilson_vector *src1, wilson_vector *src2, + complex *s, wilson_vector *dest) +* Multiply a Wilson vector by a complex scalar and add to another vector +* dest <- src1 + s*src2 +*/ +#include "complex.h" +#include "su3.h" + +void c_scalar_mult_add_wvec(wilson_vector *src1,wilson_vector *src2,complex + *phase, wilson_vector *dest) { + +#ifndef NATIVEDOUBLE +register int i,j; +complex t; + for(i=0;i<4;i++){ + for(j=0;j<3;j++){ + t = cmul(&src2->d[i].c[j],phase); + dest->d[i].c[j] = cadd(&src1->d[i].c[j],&t); + } + } + +#else +register int i,j; +register double sr,si,br,bi,cr,ci; + + sr = (*phase).real; si = (*phase).imag; + + for(i=0;i<4;i++){ + for(j=0;j<3;j++){ + br=src2->d[i].c[j].real; bi=src2->d[i].c[j].imag; + + cr = sr*br - si*bi; + ci = sr*bi + si*br; + + dest->d[i].c[j].real = src1->d[i].c[j].real + cr; + dest->d[i].c[j].imag = src1->d[i].c[j].imag + ci; + } + } +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_wvec2.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_wvec2.c new file mode 100644 index 0000000000000000000000000000000000000000..78f2f7f6eda6826cf604d557c6a6f59af1340ac4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_wvec2.c @@ -0,0 +1,27 @@ +/******************** cs_m_a_wvec2.c (in su3.a) ******************** +* +*void c_scalar_mult_add_wvec2(wilson_vector *src1, wilson_vector *src2, + complex s, wilson_vector *dest) +* Multiply a Wilson vector by a complex scalar and add to another vector +* dest <- src1 + s*src2 +*/ +#include "complex.h" +#include "su3.h" + +void c_scalar_mult_add_wvec2( wilson_vector *src1,wilson_vector *src2, + complex s, wilson_vector *dest ){ + wilson_vector src3; + register int i,j; + + scalar_mult_add_wvec( src1, src2, (s.real), dest ); + + for(i=0;i<4;i++) { + for(j=0;j<3;j++) { + src3.d[i].c[j].real = -(src2->d[i].c[j].imag); + src3.d[i].c[j].imag = src2->d[i].c[j].real; + } + } + + scalar_mult_add_wvec( dest, &src3, (s.imag), dest); + +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_mat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_mat.c new file mode 100644 index 0000000000000000000000000000000000000000..679a3aa0ce2147a5f46a7740425c350e28743534 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_mat.c @@ -0,0 +1,36 @@ +/**************** cs_m_mat.c (in su3.a) ******************************* +* * +* void c_scalar_mult_su3mat( su3_matrix *b, complex *s, su3_matrix *c) * +* C <- s*B, B and C matrices * +*/ +#include "complex.h" +#include "su3.h" + +/* c <- s*b, matrices */ +void c_scalar_mult_su3mat( su3_matrix *b, complex *s, su3_matrix *c ){ + +#ifndef NATIVEDOUBLE +register int i,j; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + c->e[i][j] = cmul(&b->e[i][j], s); + /* old: c->e[i][j].real = s.real*b->e[i][j].real-s.imag*b->e[i][j].imag; + c->e[i][j].imag = s.real*b->e[i][j].imag + s.imag*b->e[i][j].real; */ + } + +#else +register int i,j; +register double sr,si,br,bi,cr,ci; + + sr = (*s).real; si = (*s).imag; + + for(i=0;i<3;i++)for(j=0;j<3;j++){ + br=b->e[i][j].real; bi=b->e[i][j].imag; + + cr = sr*br - si*bi; + ci = sr*bi + si*br; + + c->e[i][j].real = cr; + c->e[i][j].imag = ci; + } +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_s_mat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_s_mat.c new file mode 100644 index 0000000000000000000000000000000000000000..9c284e9cf796f39da79f0e7f571385a55a86c331 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_s_mat.c @@ -0,0 +1,38 @@ +/**************** cs_m_s_mat.c (in su3.a) ***************************** +* * +* void c_scalar_mult_sub_su3mat( su3_matrix *a, su3_matrix *b, * +* complex *s, su3_matrix *c) * +* C <- A - s*B, A,B and C matrices * +*/ +#include "complex.h" +#include "su3.h" + +/* c <- a - s*b, matrices */ +void c_scalar_mult_sub_su3mat( su3_matrix *a, su3_matrix *b, complex *s, + su3_matrix *c){ + +#ifndef NATIVEDOUBLE +register int i,j; +complex t; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + t = cmul(&b->e[i][j], s); + c->e[i][j] = csub(&a->e[i][j], &t); + } + +#else +register int i,j; +register double sr,si,br,bi,cr,ci; + + sr = (*s).real; si = (*s).imag; + + for(i=0;i<3;i++)for(j=0;j<3;j++){ + br=b->e[i][j].real; bi=b->e[i][j].imag; + + cr = sr*br - si*bi; + ci = sr*bi + si*br; + + c->e[i][j].real = a->e[i][j].real - cr; + c->e[i][j].imag = a->e[i][j].imag - ci; + } +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_s_vec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_s_vec.c new file mode 100644 index 0000000000000000000000000000000000000000..1c77c41eec647f06c4c63f1eb83b7c8e8d61e6a9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_s_vec.c @@ -0,0 +1,35 @@ +/******************* cs_m_s_vec.c (in su3.a) ************************** +* * +* c_scalar_mult_sub_su3vec() * +* multiply an su3 vector by a complex scalar and subtract it from * +* another vector: v1 <- v1 - number*v2 * +*/ +#include "complex.h" +#include "su3.h" + +void c_scalar_mult_sub_su3vec( su3_vector *v1, complex *phase, su3_vector *v2 ){ + +#ifndef NATIVEDOUBLE +register int i; +complex t; + for(i=0;i<3;i++){ + t = cmul(&v2->c[i],phase); + v1->c[i] = csub(&v1->c[i],&t); + } +#else +register int i; +register double sr,si,br,bi,cr,ci; + + sr = (*phase).real; si = (*phase).imag; + + for(i=0;i<3;i++){ + br=v2->c[i].real; bi=v2->c[i].imag; + + cr = sr*br - si*bi; + ci = sr*bi + si*br; + + v1->c[i].real -= cr; + v1->c[i].imag -= ci; + } +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_vec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_vec.c new file mode 100644 index 0000000000000000000000000000000000000000..03619ee7fe8f76cba3d94fa6c631eaffebe32c38 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_vec.c @@ -0,0 +1,34 @@ +/******************* cs_m_vec.c (in su3.a) **************************** +* * +* c_scalar_mult_su3vec(): * +* multiply an su3 vector by a complex scalar * +* dest <- number*src * +*/ +#include "complex.h" +#include "su3.h" + +void c_scalar_mult_su3vec( su3_vector *src, complex *phase, su3_vector *dest ){ + +#ifndef NATIVEDOUBLE +register int i; + for(i=0;i<3;i++){ + dest->c[i] = cmul(&src->c[i],phase); + } + +#else +register int i; +register double sr,si,br,bi,cr,ci; + + sr = (*phase).real; si = (*phase).imag; + + for(i=0;i<3;i++){ + br=src->c[i].real; bi=src->c[i].imag; + + cr = sr*br - si*bi; + ci = sr*bi + si*br; + + dest->c[i].real = cr; + dest->c[i].imag = ci; + } +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/csqrt.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/csqrt.c new file mode 100644 index 0000000000000000000000000000000000000000..941ce8caab658d8dd086cbf1d93dac47f67792a3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/csqrt.c @@ -0,0 +1,14 @@ +/* Subroutines for operations on complex numbers */ +/* complex square root */ +#include +#include "complex.h" + +complex csqrt( complex *z ){ +complex c; +radix theta,r; + r = sqrt(hypot(z->real,z->imag)); + theta = 0.5*atan2(z->imag,z->real); + c = ce_itheta(theta); + c.real *=r; c.imag *= r; + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/csub.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/csub.c new file mode 100644 index 0000000000000000000000000000000000000000..cc6ada989dd1aa2c0a43fe83d21777ef2611343e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/csub.c @@ -0,0 +1,10 @@ +/* Subroutines for operations on complex numbers */ +/* complex subtract */ +#include "complex.h" + +complex csub( complex *a, complex *b ) { + complex c; + c.real = (*a).real - (*b).real; + c.imag = (*a).imag - (*b).imag; + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcadd.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcadd.c new file mode 100644 index 0000000000000000000000000000000000000000..fb10793d5d22b559d9f77206b1f8b96ec4d00ba8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcadd.c @@ -0,0 +1,10 @@ +/* Subroutines for operations on complex numbers */ +/* double complex add */ +#include "complex.h" + +double_complex dcadd( double_complex *a, double_complex *b ){ + double_complex c; + c.real = (*a).real + (*b).real; + c.imag = (*a).imag + (*b).imag; + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcdiv.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcdiv.c new file mode 100644 index 0000000000000000000000000000000000000000..4157e474c2e69f666e2db7491240d081deacb50c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcdiv.c @@ -0,0 +1,12 @@ +/* Subroutines for operations on complex numbers */ +/* double complex divide */ +#include "complex.h" + +double_complex dcdiv( double_complex *a, double_complex *b ){ + double_complex c; + double scale; + scale = 1.0/((*b).real*(*b).real+(*b).imag*(*b).imag); + c.real = scale*((*a).real*(*b).real + (*a).imag*(*b).imag); + c.imag = scale*((*a).imag*(*b).real - (*a).real*(*b).imag); + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dce_itheta.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dce_itheta.c new file mode 100644 index 0000000000000000000000000000000000000000..c7cae5f8e93780d44030c2a7cf7ca1280ffb6883 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dce_itheta.c @@ -0,0 +1,12 @@ +/* Subroutines for operations on complex numbers */ +/* double complex exp( i*theta ) */ +#include +#include "complex.h" + +double_complex dce_itheta( double theta ){ + double_complex c; + c.real = (double)cos( (double)theta ); + c.imag = (double)sin( (double)theta ); + /* there must be a more efficient way */ + return( c ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcexp.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcexp.c new file mode 100644 index 0000000000000000000000000000000000000000..c36539a844f6d1bb893d1cd1a437ba6e93ae121c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcexp.c @@ -0,0 +1,13 @@ +/* Subroutines for operations on complex numbers */ +/* double complex exponential */ +#include +#include "complex.h" + +double_complex dcexp( double_complex *a ){ + double_complex c; + double mag; + mag = (double)exp( (double)(*a).real ); + c.real = mag*(double)cos( (double)(*a).imag ); + c.imag = mag*(double)sin( (double)(*a).imag ); + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dclog.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dclog.c new file mode 100644 index 0000000000000000000000000000000000000000..2349b418a32648c200445a510489eb5501c6aaf9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dclog.c @@ -0,0 +1,11 @@ +/* Subroutines for operations on complex numbers */ +/* double complex logarithm */ +#include +#include "complex.h" + +double_complex dclog( double_complex *a ){ + double_complex c; + c.real = 0.5*(double)log((double)((*a).real*(*a).real+(*a).imag*(*a).imag)); + c.imag = (double)atan2( (double)(*a).imag, (double)(*a).real ); + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcmplx.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcmplx.c new file mode 100644 index 0000000000000000000000000000000000000000..45a5e1468b67c434a3f0db91d9d6b9988944e06e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcmplx.c @@ -0,0 +1,9 @@ +/* Subroutines for operations on complex numbers */ +/* make a double complex number from two double precision reals */ +#include "complex.h" + +double_complex dcmplx( double x, double y ){ + double_complex c; + c.real = x; c.imag = y; + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcmul.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcmul.c new file mode 100644 index 0000000000000000000000000000000000000000..8ef680fd43aba2bf9e7b68a4ff132bc76fbc203c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcmul.c @@ -0,0 +1,10 @@ +/* Subroutines for operations on complex numbers */ +/* double complex multiply */ +#include "complex.h" + +double_complex dcmul( double_complex *a, double_complex *b ){ + double_complex c; + c.real = (*a).real * (*b).real - (*a).imag * (*b).imag; + c.imag = (*a).imag * (*b).real + (*a).real * (*b).imag; + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dconjg.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dconjg.c new file mode 100644 index 0000000000000000000000000000000000000000..62efb9c4f75205a2afe9289e84c478c0308513ca --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dconjg.c @@ -0,0 +1,10 @@ +/* Subroutines for operations on complex numbers */ +/* double precision complex conjugate */ +#include "complex.h" + +double_complex dconjg( double_complex *a ){ + double_complex c; + c.real = (*a).real; + c.imag = -(*a).imag; + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcsqrt.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcsqrt.c new file mode 100644 index 0000000000000000000000000000000000000000..6a1db45ea1b5707cef4436cff57d71c32539f201 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcsqrt.c @@ -0,0 +1,15 @@ +/* Subroutines for operations on complex numbers */ +/* double precision complex square root */ +#include +#include "complex.h" + +double_complex dcsqrt( double_complex *z ){ +double_complex c; +double theta,r; + r = sqrt(hypot(z->real,z->imag)); + theta = 0.5*atan2(z->imag,z->real); + c = dce_itheta(theta); + c.real *=r; c.imag *= r; + return(c); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcsub.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcsub.c new file mode 100644 index 0000000000000000000000000000000000000000..989254f8ea7f5b13bf52809eeabf80e5caf7fa34 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcsub.c @@ -0,0 +1,10 @@ +/* Subroutines for operations on complex numbers */ +/* double complex subtract */ +#include "complex.h" + +double_complex dcsub( double_complex *a, double_complex *b ){ + double_complex c; + c.real = (*a).real - (*b).real; + c.imag = (*a).imag - (*b).imag; + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/det_su3.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/det_su3.c new file mode 100644 index 0000000000000000000000000000000000000000..13c273c7c883b2640e8afa8425e7a0657c682809 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/det_su3.c @@ -0,0 +1,31 @@ +/****************** det_su3.c (in su3.a) ****************************** +* * +* complex det_su3( su3_matrix *a ) * +* Complex determinant of an SU3 matrix * +*/ +#include "complex.h" +#include "su3.h" + +/* FIX THIS - more efficient to take cross product of first two + rows, dot with third. */ +complex det_su3( su3_matrix *a ) { +register complex cc,dd,sum; + CMUL(a->e[0][0],a->e[1][1],cc); + CMUL(cc,a->e[2][2],sum); + CMUL(a->e[0][0],a->e[1][2],cc); + CMUL(cc,a->e[2][1],dd); + CSUB(sum,dd,sum); + CMUL(a->e[0][1],a->e[1][2],cc); + CMUL(cc,a->e[2][0],dd); + CADD(sum,dd,sum); + CMUL(a->e[0][1],a->e[1][0],cc); + CMUL(cc,a->e[2][2],dd); + CSUB(sum,dd,sum); + CMUL(a->e[0][2],a->e[1][0],cc); + CMUL(cc,a->e[2][1],dd); + CADD(sum,dd,sum); + CMUL(a->e[0][2],a->e[1][1],cc); + CMUL(cc,a->e[2][0],dd); + CSUB(sum,dd,sum); + return(sum); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dump_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dump_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..9c6e8ca5ba2b5ca7f327f3160d692deb483d62fd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dump_wvec.c @@ -0,0 +1,18 @@ +/**************** dump_wvec.c (in su3.a) *********************** +* * +* void dump_wvec( wilson_vector *v ) * +* Print out a Wilson vector * +*/ +#include +#include "complex.h" +#include "su3.h" + +void dump_wvec( wilson_vector *v ){ +register int i,j; + for(i=0;i<4;i++){ + for(j=0;j<3;j++)printf("(%.2e,%.2e)\t", + v->d[i].c[j].real,v->d[i].c[j].imag); + printf("\n"); + } + printf("\n"); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dumpmat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dumpmat.c new file mode 100644 index 0000000000000000000000000000000000000000..248a323418a72b3de7d8c3960b1d13e4c66422b9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dumpmat.c @@ -0,0 +1,18 @@ +/****************** dumpmat.c (in su3.a) ****************************** +* * +* void dumpmat( su3_matrix *mat ) * +* print out a 3x3 complex matrix * +*/ +#include +#include "complex.h" +#include "su3.h" + +void dumpmat( su3_matrix *m ){ +int i,j; + for(i=0;i<3;i++){ + for(j=0;j<3;j++)printf("(%.2e,%.2e)\t", + m->e[i][j].real,m->e[i][j].imag); + printf("\n"); + } + printf("\n"); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dumpvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dumpvec.c new file mode 100644 index 0000000000000000000000000000000000000000..7a34ec3190796a2f80f8dd8f0cef2b992c41d445 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dumpvec.c @@ -0,0 +1,15 @@ +/******************* dumpvec.c (in su3.a) ***************************** +* * +* void dumpvec( su3_vector *vec ) * +* print out a 3 element complex vector * +*/ +#include +#include "complex.h" +#include "su3.h" + +void dumpvec( su3_vector *v ){ +int j; + for(j=0;j<3;j++)printf("(%.2e,%.2e)\t", + v->c[j].real,v->c[j].imag); + printf("\n"); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/flush_to_zero.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/flush_to_zero.c new file mode 100644 index 0000000000000000000000000000000000000000..bac931918ee2b5b38fbb3d61b8801094a16784d4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/flush_to_zero.c @@ -0,0 +1,4 @@ +/** flush_to_zero.c ***/ + +/* DUMMY ROUTINE - nothing to do unless on Intel machine */ +void flush_to_zero(){} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/gaussrand.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/gaussrand.c new file mode 100644 index 0000000000000000000000000000000000000000..d58b790b5683961073a3fd360351d2c68f06fbb5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/gaussrand.c @@ -0,0 +1,35 @@ +/***************** gaussrand.c (in su3.a) ***************************** +* * +* radix gaussian_ran_no( passthru *prn_pt ) * +* Gaussian distributed random number * +* Probability distribution exp( -x*x ), so < x^2 > = 1/2 * +* This requires a random number generator named "myrand()", returning * +* a radix uniformly distributed between zero and one. The argument of * +* this routine is a pointer to be passed to myrand(). * +*/ + +#include +#include "complex.h" +#include "su3.h" + +radix gaussian_rand_no( void *prn_pt ){ +radix myrand(); +static int iset=0; +static radix gset; +radix fac,r,v1,v2; + + if (iset == 0) { + do { + v1=2.0*myrand(prn_pt)-1.0; + v2=2.0*myrand(prn_pt)-1.0; + r=v1*v1+v2*v2; + } while (r >= 1.0); + fac=sqrt( -log((double)r)/(double)r); + gset=v1*fac; + iset=1; + return v2*fac; + } else { + iset=0; + return gset; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_clover.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_clover.h new file mode 100644 index 0000000000000000000000000000000000000000..50647075165c9e524c67a72b1a36ff1f9dd688b0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_clover.h @@ -0,0 +1,87 @@ +/************************ generic_clover.h ****************************** +* * +* Macros and declarations for generic_clover routines * +* This header is for codes that call generic_clover routines * +* MIMD version 5 * +* * +*/ + + +int clover_invert( /* Return value is number of iterations taken */ + field_offset src, /* type wilson_vector (source already created)*/ + field_offset dest, /* type wilson_vector (answer and initial guess) */ + field_offset tmp, /* type wilson_vector (workspace used only for bi-cg)*/ + field_offset sav, /* type wilson_vector (for saving source) */ + int MinCG, /* minimum number of iterations per restart */ + int MaxCG, /* maximum number of iterations per restart */ + int nrestart, /* maximum restarts */ + radix RsdCG, /* desired residual - + normalized as sqrt(r*r)/sqrt(src_e*src_e */ + radix *size_r, /* resulting residual */ + int start_flag, /* 0: use a zero initial guess; 1: use dest */ + radix Kappa, /* hopping */ + radix Clov_c, /* Perturbative clover coeff */ + radix U0, /* Tadpole correction to Clov_c */ + field_offset f_mn /* size of su3_matrix (workspace) */ + ); + +int clover_invert_lean( /* Return value is number of iterations taken */ + field_offset src, /* type wilson_vector (where source is to be created)*/ + field_offset dest, /* type wilson_vector (answer and initial guess) */ + field_offset tmp, /* type wilson_vector (workspace used only for bi-cg)*/ + void (*source_func)(field_offset src, + wilson_quark_source *wqs), /* source function */ + wilson_quark_source *wqs, /* source parameters */ + int MinCG, /* minimum number of iterations per restart */ + int MaxCG, /* maximum number of iterations per restart */ + int nrestart, /* maximum restarts */ + radix RsdCG, /* desired residual - + normalized as sqrt(r*r)/sqrt(src_e*src_e */ + radix *size_r, /* resulting residual */ + int start_flag, /* 0: use a zero initial guess; 1: use dest */ + radix Kappa, /* hopping */ + radix Clov_c, /* Perturbative clover coeff */ + radix U0, /* Tadpole correction to Clov_c */ + field_offset f_mn /* size of su3_matrix (workspace) */ + ); + +int cgilu_cl( /* Return value is number of iterations taken */ + field_offset src, /* type wilson_vector (source vector - OVERWRITTEN!)*/ + field_offset dest, /* type wilson_vector (answer and initial guess )*/ + int MinCG, /* minimum number of iterations per restart */ + int MaxCG, /* maximum number of iterations */ + radix RsdCG, /* desired residual - + normalized as sqrt(r*r)/sqrt(src_e*src_e */ + radix *size_r, /* resulting residual */ + int flag, /* 0: use a zero initial guess; 1: use dest */ + radix Kappa, /* hopping */ + radix Clov_c, /* Perturbative clover coeff */ + radix U0, /* Tadpole correction to Clov_c */ + field_offset f_mn /* Scratch space of size su3_matrix */ + ); +int bicgilu_cl( /* Return value is number of iterations taken */ + field_offset src, /* type wilson_vector (source vector - OVERWRITTEN!)*/ + field_offset dest, /* type wilson_vector (answer and initial guess )*/ + int MinCG, /* minimum number of iterations per restart */ + int MaxCG, /* maximum number of iterations */ + radix RsdCG, /* desired residual - + normalized as sqrt(r*r)/sqrt(src_e*src_e */ + radix *size_r, /* resulting residual */ + int flag, /* 0: use a zero initial guess; 1: use dest */ + radix Kappa, /* hopping */ + field_offset rv, /* Scratch space of size wilson_vector */ + radix Clov_c, /* Perturbative clover coeff */ + radix U0, /* Tadpole correction to Clov_c */ + field_offset f_mn /* Scratch space of size su3_matrix */ + ); +void f_mu_nu(field_offset f_mn,int mu,int nu); +void make_clov(radix Clov_c,field_offset f_mn); +void make_clovinv(); + +void mult_ldu( + field_offset src, /* type wilson_vector RECAST AS wilson_block_vector */ + field_offset dest, /* type wilson_vector RECAST AS wilson_block_vector */ + field_offset triang,/* type triangular */ + field_offset diag, /* type diagonal */ + int parity ); + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_form.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_form.h new file mode 100644 index 0000000000000000000000000000000000000000..3cdc33517fd99432d09d9ef72b501f74109aab2b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_form.h @@ -0,0 +1,31 @@ +/************************ generic_form.h ************************************* +* * +* Macros and declarations for miscellaneous generic routines * +* This header is for codes that call generic_form routines * +* MIMD version 5 * +* * +*/ + +void c_scale_wilson_vector(wilson_vector *m , complex scale); +void copy_site_wilson_vector(field_offset src, field_offset dest) ; +void flip_source_re(field_offset quark_prop); +int load_momentum_from_disk(int mom_in[][3], char filename[], int max_mom); +void load_scalar_smear(radix *data, int dim, char filename[]); +void load_smearing(field_offset where_smear, char filename[80]); +void mult_gamma(int phase, gamma_matrix *g1, gamma_matrix *g2, gamma_matrix *g3); +void make_gammas(gamma_matrix *gamma); +void mult_sw_by_gamma_l(spin_wilson_vector * src, + spin_wilson_vector * dest, int dir); +void mult_sw_by_gamma_r(spin_wilson_vector * src, + spin_wilson_vector * dest, int dir); +void meson_cont_mom(complex prop[], + field_offset src1,field_offset src2, + int base_pt, int q_stride, int op_stride, + gamma_corr gamma_table[], int no_gamma_corr); +void load_wilson_source(field_offset src, field_offset dest,int color,int spin); + +void load_wvec(wilson_vector *dest, complex *z, int spin, int colour) ; + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_ks.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_ks.h new file mode 100644 index 0000000000000000000000000000000000000000..b2069e359de5fc89b57ea62f78f57db47d53a0f1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_ks.h @@ -0,0 +1,42 @@ +/************************ generic_ks.h ********************************** +* * +* Macros and declarations for generic_ks routines * +* This header is for codes that call generic_ks routines * +* MIMD version 5 * +* * +*/ + +int congrad( int niter, radix rsqmin, int parity, radix *rsq ); +void copy_latvec(field_offset src, field_offset dest, int parity); +void dslash( field_offset src, field_offset dest, int parity ); +void dslash_special( field_offset src, field_offset dest, + int parity, msg_tag **tag, int start ); +void clear_latvec(field_offset v,int parity); + +void scalar_mult_latvec(field_offset src, radix scalar, + field_offset dest, int parity); +void scalar_mult_add_latvec(field_offset src1, field_offset src2, + radix scalar, field_offset dest, int parity); +void grsource(int parity); +void checkmul(); +int spectrum(); +void make_lattice(); +void phaseset(); +void rephase( int flag ); + +void prefetch_vector( su3_vector * ); +void prefetch_matrix( su3_matrix * ); +void V_sma_and_rdot( su3_vector * ttt_pt, su3_vector * cg_p_pt, radix x, + double * pkp_pt, int nsites, int stride ); +void V_sma2_and_mag( su3_vector * xxx_pt, su3_vector * cg_p_pt, + su3_vector * resid_pt, su3_vector * ttt_pt, radix a, + double * rsq_pt, int nsites, int stride ); +void V_sma_vec( su3_vector * src1, su3_vector * src2, + radix scalar, su3_vector * dest, int nsites, int stride ); +void V_mult_adj_su3_mat_vec_4dir( su3_matrix * lpt, + su3_vector * srcpt, su3_vector * destpt, int nsites, int stride ); +void V_mult_su3_mat_vec_sum_4dir( su3_matrix * lpt, + su3_vector ** xpt, su3_vector ** ypt, su3_vector ** zpt, su3_vector ** tpt, + su3_vector * destpt, int nsites, int stride ); + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_notused.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_notused.h new file mode 100644 index 0000000000000000000000000000000000000000..bc0be55afdbf1f1a2961b1ef914a1bdd6ed0691e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_notused.h @@ -0,0 +1,80 @@ +/************************ generic.h ************************************* +* * +* Macros and declarations for miscellaneous generic routines * +* This header is for codes that call generic routines * +* MIMD version 5 * +* * +*/ + +/* Other generic directory declarations are elsewhere: + + For com_*.c, see comdefs.h + For io_ansi.c, io_nonansi.c, io_piofs.c, io_paragon.c see io_lat.h + For io_wb.c, see io_wb.h +*/ + +/* bsd sum */ +#ifndef _type32 +#define _type32 +#ifdef SHORT32 +typedef unsigned short type32; +#else +typedef unsigned int type32; +#endif +#endif +type32 bsd_sum (char *data,type32 total_bytes); + +/* check_unitarity.c */ +void check_unitarity( void ); + +/* Routines in layout_*.c */ +void setup_layout( void ); +int node_number(int x,int y,int z,int t); +int node_index(int x,int y,int z,int t); +int num_sites(int node); + +/* ploop?.c */ +complex ploop( void ); + +/* d_plaq?.c */ +void d_plaquette(double *ss_plaq,double *st_plaq); + +/* plaquette_generic.c */ +void plaquette_generic(radix *ss_plaq,radix *st_plaq,field_offset su3_mat); + +/* plaquette4.c */ +void plaquette(radix *ss_plaq,radix *st_plaq); + +/* ploop_staple.c */ +complex ploop_staple(radix alpha_fuzz); + +/* ranstuff.c */ +void initialize_prn(double_prn *prn_pt, int seed, int index); +radix myrand(double_prn *prn_pt); + +/* ranmom.c */ +void ranmom(); + +/* restrict_fourier.c */ +void setup_restrict_fourier( int *key, int *restrict); +void restrict_fourier( + field_offset src, /* src is field to be transformed */ + field_offset space, /* space is working space, same size as src */ + field_offset space2,/* space2 is working space, same size as src */ + /* space2 is needed only for non power of 2 */ + int size, /* Size of field in bytes. The field must + consist of size/sizeof(complex) consecutive + complex numbers. For example, an su3_vector + is 3 complex numbers. */ + int isign); /* 1 for x -> k, -1 for k -> x */ + +/* gaugefix.c */ +void gaugefix(int gauge_dir,radix relax_boost,int max_gauge_iter, + radix gauge_fix_tol, field_offset diffmat, field_offset sumvec, + int nvector, field_offset vector_offset[], int vector_parity[], + int nantiherm, field_offset antiherm_offset[], + int antiherm_parity[] ); + +/* reunitarize.c */ +void reunitarize( void ); + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_wilson.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_wilson.h new file mode 100644 index 0000000000000000000000000000000000000000..cba3be6be66466750dc9cff959c3753caa4425cb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_wilson.h @@ -0,0 +1,95 @@ +/************************ generic_wilson.h ****************************** +* * +* Macros and declarations for generic_wilson routines * +* This header is for codes that call generic_wilson routines * +* MIMD version 5 * +* * +*/ + +/* For various inversion routines. Not used sytematically yet. CD */ +enum guess_params { START_ZERO_GUESS = 0 , START_NONZERO_GUESS } ; + +int wilson_invert( /* Return value is number of iterations taken */ + field_offset src, /* type wilson_vector (source already created)*/ + field_offset dest, /* type wilson_vector (answer and initial guess) */ + field_offset tmp, /* type wilson_vector (workspace used only for bi-cg)*/ + field_offset sav, /* type wilson_vector (for saving source) */ + int MinCG, /* minimum number of iterations per restart */ + int MaxCG, /* maximum number of iterations per restart */ + int nrestart, /* maximum restarts */ + radix RsdCG, /* desired residual - + normalized as sqrt(r*r)/sqrt(src_e*src_e */ + radix *size_r, /* resulting residual */ + int start_flag, /* 0: use a zero initial guess; 1: use dest */ + radix Kappa /* hopping */ + ); + +int wilson_invert_lean( /* Return value is number of iterations taken */ + field_offset src, /* type wilson_vector (where source is to be created)*/ + field_offset dest, /* type wilson_vector (answer and initial guess) */ + field_offset tmp, /* type wilson_vector (workspace used only for bi-cg)*/ + void (*source_func)(field_offset src, + wilson_quark_source *wqs), /* source function */ + wilson_quark_source *wqs, /* source parameters */ + int MinCG, /* minimum number of iterations per restart */ + int MaxCG, /* maximum number of iterations per restart */ + int nrestart, /* maximum restarts */ + radix RsdCG, /* desired residual - + normalized as sqrt(r*r)/sqrt(src_e*src_e */ + radix *size_r, /* resulting residual */ + int start_flag, /* 0: use a zero initial guess; 1: use dest */ + radix Kappa /* hopping */ + ); + + +int congrad(int niter,radix rsqmin,radix *final_rsq_ptr); + +void copy_site_wilson_vector(field_offset src, field_offset dest); + +int cgilu_w( /* Return value is number of iterations taken */ + field_offset src, /* type wilson_vector (source vector - OVERWRITTEN!)*/ + field_offset dest, /* type wilson_vector (answer and initial guess )*/ + int MinCG, /* minimum number of iterations */ + int MaxCG, /* maximum number of iterations */ + radix RsdCG, /* desired residual - + normalized as sqrt(r*r)/sqrt(src_e*src_e */ + radix *size_r, /* resulting residual */ + int flag, /* 0: use a zero initial guess; 1: use dest */ + radix Kappa /* hopping */ + ); +int bicgilu_w( /* Return value is number of iterations taken */ + field_offset src, /* type wilson_vector (source vector - OVERWRITTEN!)*/ + field_offset dest, /* type wilson_vector (answer and initial guess )*/ + int MinCG, /* minimum number of iterations */ + int MaxCG, /* maximum number of iterations */ + radix RsdCG, /* desired residual - + normalized as sqrt(r*r)/sqrt(src_e*src_e */ + radix *size_r, /* resulting residual */ + int flag, /* 0: use a zero initial guess; 1: use dest */ + radix Kappa, /* hopping */ + field_offset rv /* Scratch space of size wilson_vector */ + ); +int mrilu_w_or(field_offset src,field_offset dest,int MinMR,int MaxMR,radix RsdMR, + radix *size_r,int flag,radix Kappa); + +/* For quark source routines */ +/* The Weyl representation types are included for w_source_h */ +enum source_type { + POINT = 1, GAUSSIAN, CUTOFF_GAUSSIAN, + POINT_WEYL, CUTOFF_GAUSSIAN_WEYL } ; +void w_source(field_offset src,wilson_quark_source *wqs); +void w_source_h(field_offset src,wilson_quark_source *wqs); +radix *make_template(radix gamma, int cutoff); +void w_sink(field_offset snk,wilson_quark_source *wqs); +int ask_quark_source( int prompt, int *type, char *descrp ); + +void bj_to_weyl( wilson_vector *src, wilson_vector *dest); +void dslash(field_offset src,field_offset dest, + int isign,int parity); +void dslash_special(field_offset src,field_offset dest, + int isign,int parity,msg_tag **tag,int is_started); +void w_meson(field_offset src1,field_offset src2,complex *prop[10]); +void w_baryon(field_offset src1,field_offset src2,field_offset src3, + complex *prop[4]); +void w_baryon_hl(field_offset src1,field_offset src2, + field_offset src3, complex *prop[6]); diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/grow4wvecs.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/grow4wvecs.c new file mode 100644 index 0000000000000000000000000000000000000000..f2c8856e52deae251a7906364b35bae762a6ddbd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/grow4wvecs.c @@ -0,0 +1,178 @@ +/***************** grow4wvecs.c (in su3.a) **************************** +* * +* If sum=0, * +* Grow and add four wilson_vectors * +* If sum=1, * +* Grow and sum four wilson_vectors to another wilson_vector * +* void grow_four_wvecs(a,b1,b2,b3,b4,sign,sum) * +* wilson_vector *a; half_wilson_vector *b1,*b2,*b3,*b4; * +* int sign,sum; * +* A <- B1 + B2 + B3 + B4 or * +* A <- A + B1 + B2 + B3 + B4 * +* B1 is expanded using gamma_x, B2 using gamma_y, etc. * +*/ +#include "complex.h" +#include "su3.h" +/* Directions, and a macro to give the opposite direction */ +/* These must go from 0 to 7 because they will be used to index an + array. */ +/* Also define NDIRS = number of directions */ +#define XUP 0 +#define YUP 1 +#define ZUP 2 +#define TUP 3 +#define TDOWN 4 +#define ZDOWN 5 +#define YDOWN 6 +#define XDOWN 7 + +#define OPP_DIR(dir) (7-(dir)) /* Opposite direction */ +#define NDIRS 8 /* number of directions */ + +/* grow and sum four wilson_vectors */ + +#ifndef FAST + +void grow_add_four_wvecs( wilson_vector *a, half_wilson_vector *b1, + half_wilson_vector *b2, half_wilson_vector *b3, + half_wilson_vector *b4, int sign, int sum ){ + if(sum==0)wp_grow( b1,a,XUP,sign); + else wp_grow_add( b1,a,XUP,sign); + wp_grow_add( b2,a,YUP,sign); + wp_grow_add( b3,a,ZUP,sign); + wp_grow_add( b4,a,TUP,sign); +} + +#else /* "FAST" code has wp_grow_add inlined */ +/* For the RS6000 */ + +/* a += i*b, a += -i*b */ +#define CSUM_TPI(a,b) { (a).real -= (b).imag; (a).imag += (b).real; } +#define CSUM_TMI(a,b) { (a).real += (b).imag; (a).imag -= (b).real; } + +void grow_add_four_wvecs( wilson_vector *a, half_wilson_vector *b1, + half_wilson_vector *b2, half_wilson_vector *b3, + half_wilson_vector *b4, int sign, int sum ){ + int i; + if(sum==0) + { + /* wp_grow( b1,a,XUP,sign); */ + + /* case XUP: */ + if(sign==PLUS) + { + for(i=0;i<3;i++){ + a->d[0].c[i] = b1->h[0].c[i]; + a->d[1].c[i] = b1->h[1].c[i]; + TIMESMINUSI( b1->h[0].c[i], a->d[3].c[i]); + TIMESMINUSI( b1->h[1].c[i], a->d[2].c[i]); + } + } + else + { + /* case XDOWN: */ + for(i=0;i<3;i++){ + a->d[0].c[i] = b1->h[0].c[i]; + a->d[1].c[i] = b1->h[1].c[i]; + TIMESPLUSI( b1->h[0].c[i], a->d[3].c[i]); + TIMESPLUSI( b1->h[1].c[i], a->d[2].c[i]); + } + } + } + else + { + /*wp_grow_add( b1,a,XUP,sign); */ + + /* case XUP: */ + if(sign==PLUS) + { + for(i=0;i<3;i++){ + CSUM( a->d[0].c[i], b1->h[0].c[i]); + CSUM( a->d[1].c[i], b1->h[1].c[i]); + CSUM_TMI( a->d[2].c[i], b1->h[1].c[i] ); + CSUM_TMI( a->d[3].c[i], b1->h[0].c[i] ); + } + } + else + { + /* case XDOWN: */ + for(i=0;i<3;i++){ + CSUM( a->d[0].c[i], b1->h[0].c[i]); + CSUM( a->d[1].c[i], b1->h[1].c[i]); + CSUM_TPI( a->d[2].c[i], b1->h[1].c[i] ); + CSUM_TPI( a->d[3].c[i], b1->h[0].c[i] ); + } + } + } + + /* wp_grow_add( b2,a,YUP,sign); */ + + if(sign==PLUS) + { + /* case YUP: */ + for(i=0;i<3;i++){ + CSUM( a->d[0].c[i], b2->h[0].c[i]); + CSUM( a->d[1].c[i], b2->h[1].c[i]); + CSUM( a->d[2].c[i], b2->h[1].c[i]); + CSUB( a->d[3].c[i], b2->h[0].c[i], a->d[3].c[i] ); + } + } + else + { + /* case YDOWN: */ + for(i=0;i<3;i++){ + CSUM( a->d[0].c[i], b2->h[0].c[i]); + CSUM( a->d[1].c[i], b2->h[1].c[i]); + CSUB( a->d[2].c[i], b2->h[1].c[i], a->d[2].c[i] ); + CSUM( a->d[3].c[i], b2->h[0].c[i]); + } + } + + /* wp_grow_add( b3,a,ZUP,sign); */ + + if(sign==PLUS) + { + /* case ZUP: */ + for(i=0;i<3;i++){ + CSUM( a->d[0].c[i], b3->h[0].c[i]); + CSUM( a->d[1].c[i], b3->h[1].c[i]); + CSUM_TMI( a->d[2].c[i], b3->h[0].c[i] ); + CSUM_TPI( a->d[3].c[i], b3->h[1].c[i] ); + } + } + else + { + /* case ZDOWN:*/ + for(i=0;i<3;i++){ + CSUM( a->d[0].c[i], b3->h[0].c[i]); + CSUM( a->d[1].c[i], b3->h[1].c[i]); + CSUM_TPI( a->d[2].c[i], b3->h[0].c[i] ); + CSUM_TMI( a->d[3].c[i], b3->h[1].c[i] ); + } + } + + /* wp_grow_add( b4,a,TUP,sign); */ + + if(sign==PLUS) + { + /* case TUP: */ + for(i=0;i<3;i++){ + CSUM( a->d[0].c[i], b4->h[0].c[i]); + CSUM( a->d[1].c[i], b4->h[1].c[i]); + CSUM( a->d[2].c[i], b4->h[0].c[i]); + CSUM( a->d[3].c[i], b4->h[1].c[i]); + } + } + else + { + /* case TDOWN: */ + for(i=0;i<3;i++){ + CSUM( a->d[0].c[i], b4->h[0].c[i]); + CSUM( a->d[1].c[i], b4->h[1].c[i]); + CSUB( a->d[2].c[i], b4->h[0].c[i], a->d[2].c[i] ); + CSUB( a->d[3].c[i], b4->h[1].c[i], a->d[3].c[i] ); + } + } +} + +#endif /* "#ifndef FAST"*/ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/io_wb.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/io_wb.h new file mode 100644 index 0000000000000000000000000000000000000000..671874691d9bb42ac83de171e492111dc3a6ea72 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/io_wb.h @@ -0,0 +1,313 @@ +/************************ io_wb.h ************************************* +/* This header file defines the binary file format for the propagator file and + defines structures for file descriptors that include the file header information */ + +/* Define a 32-bit integer type. Machine-dependent. Add more as needed. */ +/* + Original by CD + 2/26/98 Changed type32 to signed CD + */ + +#ifndef _type32 +#define _type32 +#ifdef SHORT32 +typedef short type32; +#else +typedef int type32; +#endif +#endif + +#ifdef CONTROL +#define EXTERN +#else +#define EXTERN extern +#endif + +/**********************************************************************/ +/* Binary lattice formats */ +/**********************************************************************/ +/* In version 5 we have two binary lattice file formats: + serial files Data written in coordinate natural order + checkpoint files Data written in node dump order + + Further descriptive information is kept in a separate ASCII header + file. See below. + + */ + + +/*--------------------------------------------------------------------*/ +/* version 5 binary file format */ + +#define W_PROP_VERSION_NUMBER 12781 +#define MAX_TIME_STAMP 64 +#define MAX_SOURCE_SPINS 4 + +/* Begin definition of header stuctures */ + +/* Note that an effort is made to make all radixing point and integer + fields 32 bits long. However, byte ordering may vary across + platforms, and no effort is made in writing the file to produce + a standard byte order. The input routines attempt to compensate + for byte reversal automatically, by examining the magic number at + the beginning of the file */ + +/* 1. Header comes first */ + +typedef struct { + type32 magic_number; /* Identifies file format */ + char time_stamp[MAX_TIME_STAMP]; /* Date and time stamp - used to + check consistency between the + ASCII header file and the + lattice file */ + type32 dims[4]; /* Full lattice dimensions */ + type32 header_bytes; /* NOT WRITTEN TO THE FILE but + helpful for finding the data */ + + type32 order; /* 0 file is in natural order + no coordinate list is attached. + 1 file is in node-dump (checkpoint) + order. Coordinate list is attached. + 2 file is in node-dump (checkpoint) + order but one file per node. + Coordinate list is attached + before each file. */ + + type32 n_spins; /* Number of source spins in this file */ + type32 spins[MAX_SOURCE_SPINS]; /* List of source spin indices in file */ + +} w_prop_header; + +/* 2. Site list (ONLY for checkpoint files - i.e. node-dump order files) + + A listing of site coordinates for the data in this file + in the order of appearance. The number of coordinates must + be exactly nx*ny*nz*nt. The site coordinate is encoded + as nx*(ny*(nz*t + z) + y) + x in a 32-bit integer. + + */ + +/* 3. Next comes a spin - color index and checksum */ + +typedef struct { + type32 spin; + type32 color; + type32 sum29; + type32 sum31; +} w_prop_check; + +/* 4. Finally the Wilson vectors appear */ + +/**********************************************************************/ +/* Info file format */ + +/* List of admissible keywords for version 5 ASCII lattice info file */ + +#ifdef CONTROL +char *w_prop_info_keyword[] = { + "magic_number", + "time_stamp", + "nx", + "ny", + "nz", + "nt", + "gauge.filename", + "gauge.time_stamp", + "gauge.checksums", + "gauge.fix.description", + "gauge.fix.tolerance", + "gauge.fix.filename", + "gauge.fix.time_stamp", + "gauge.fix.checksums", + "quark.description", + "quark.kappa", + "quark.clover.clov_c", + "quark.clover.u0", + "quark.boundary_condition", + "source.description", + "source.size", + "source.x", + "source.y", + "source.z", + "source.t", + "source.n_spins", + "source.spins", + "" /* Last entry MUST be a zero-length keyword */ +}; +#else +extern char *w_prop_info_keyword[]; +#endif + +/* Used to create info file name */ + +#define ASCII_W_PROP_INFO_EXT ".info" + +/**********************************************************************/ +/* 1996 Binary file format follows */ +/* Kept for compatibility */ + +#define MAX_GAUGE_FIELD_DESCRIPT 200 +#define MAX_GAUGE_FIELD_PARAM 2 +#define MAX_DIRAC_DESCRIPT 200 +#define MAX_DIRAC_PARAM 3 +#define MAX_SOURCE_DESCRIPT 200 +#define MAX_SOURCE_PARAM 2 +#define IDENTITY_MAP -1 +#define NO_MAP -2 +#define W_PROP_VERSION_NUMBER_1996 48291 + +/* Begin definition of header stuctures */ + +/* Note that an effort is made to make all radixing point and integer + fields 32 bits long. However, byte ordering may vary across + platforms, and no effort is made in writing the file to produce + a standard byte order. The input routines attempt to compensate + for byte reversal automatically, by examining the magic number at + the beginning of the file */ + +/* 1. Header comes first */ + +typedef struct { + type32 magic_number; /* Identifies file format */ + type32 dims[4]; /* Full lattice dimensions */ + type32 header_bytes; /* Number of bytes for data belonging to + this structure -- NOT necessarily + the length of this structure! */ + type32 order; /* 0 means no coordinate list is attached + and the values are in coordinate serial order + Nonzero means that a coordinate list is attached, + specifying the order of values */ + struct { /* Gauge field parameters */ + type32 n_descript; /* Number of bytes in character string */ + char descript[MAX_GAUGE_FIELD_DESCRIPT]; /* Describes gauge field */ + type32 n_param; /* Number of gauge field parameters */ + radix param[MAX_GAUGE_FIELD_PARAM]; /* GF parameters */ + } gauge_field; + struct { /* Dirac operator parameters */ + type32 n_descript; /* Number of bytes in character string */ + char descript[MAX_DIRAC_DESCRIPT]; /* Describes Dirac operator */ + type32 n_param; /* Number of Dirac operator parameters */ + radix param[MAX_DIRAC_PARAM]; /* Dirac parameters */ + } dirac; + struct { /* Source parameters */ + type32 n_descript; /* Number of bytes in character string */ + char descript[MAX_SOURCE_DESCRIPT]; /* Describes source */ + type32 n_param; /* Number of source parameters */ + struct { /* Source parameters */ + type32 i1; + radix c1; + } param; + type32 n_spins; /* Number of source spins in this file */ + type32 spins[MAX_SOURCE_SPINS]; /* List of source spin indices in file */ + } source; +} w_prop_header_1996 ; + +/* 2. Parallel files only: + + Next comes a listing of site coordinates for the data in this file + in the order of appearance. The number of coordinates must + be exactly nx*ny*nz*nt. The site coordinate is encoded + as nx*(ny*(nz*t + z) + y) + x in a 32-bit integer. + + Serial files only: + + The site order of propagator elements is required to be in subscript + order (x,y,z,t) with x varying most rapidly, followed by y, etc. + so this list is omitted. + + */ + +/* Next, repeat Items 3 and 4 for each source spin and color */ + +/* 3. Next comes a check structure to introduce the propagator components + for a given source spin and color and node number. */ + +EXTERN struct { + type32 spin; + type32 color; + type32 checksum; +} w_prop_check_1996; + +/* 4. Finally, the propagator Wilson vectors appear */ + +/*----------------------------------------------------------------------*/ + +/* File data structure */ + +typedef struct { + FILE * fp; /* File pointer */ + w_prop_header* header; /* Pointer to header for file */ + char * filename; /* Pointer to file name string */ + int byterevflag; /* Byte reverse flag - used only for reading */ + type32 * rank2rcv; /* File site list - used only for + serial reading */ + int parallel; /* 0 if file was opened for serial reading + 1 if opened for parallel reading */ + w_prop_check check; /* Current checksum, spin, color indices */ +} w_prop_file; + +/**********************************************************************/ +/* Declarations for I/O routines in io_wb.c */ + +w_prop_file *r_ascii_w_i(char *filename); +int r_ascii_w(w_prop_file *wpf, int spin, int color, field_offset src); +void r_ascii_w_f(w_prop_file *wpf); + +w_prop_file *r_serial_w_i(char *filename); +int r_serial_w(w_prop_file *wpf, int spin, int color, field_offset src); +void r_serial_w_f(w_prop_file *wpf); + +w_prop_file *r_parallel_w_i(char *filename); +void r_parallel_w_o(w_prop_file *wpf); +int r_parallel_w(w_prop_file *wpf, int spin, int color, field_offset src); +void r_parallel_w_c(w_prop_file *wpf); +void r_parallel_w_f(w_prop_file *wpf); + +w_prop_file *w_ascii_w_i(char *filename); +void w_ascii_w(w_prop_file *wpf, int spin, int color, field_offset src); +void w_ascii_w_f(w_prop_file *wpf); + +w_prop_file *w_serial_w_i(char *filename); +void w_serial_w(w_prop_file *wpf, int spin, int color, field_offset src); +void w_serial_w_f(w_prop_file *wpf); + +w_prop_file *w_parallel_w_i(char *filename); +void w_parallel_w_o(w_prop_file *wpf); +void w_parallel_w(w_prop_file *wpf, int spin, int color, field_offset src); +void w_parallel_w_c(w_prop_file *wpf); +void w_parallel_w_f(w_prop_file *wpf); + +w_prop_file *w_checkpoint_w_i(char *filename); +void w_checkpoint_w_o(w_prop_file *wpf); +void w_checkpoint_w(w_prop_file *wpf, int spin, int color, field_offset src); +void w_checkpoint_w_c(w_prop_file *wpf); +void w_checkpoint_w_f(w_prop_file *wpf); + +int write_w_prop_info_item( FILE *fpout, /* ascii file pointer */ + char *keyword, /* keyword */ + char *fmt, /* output format - + must use s, d, f, or e */ + void *src, /* address of starting data */ + int count, /* number of data items if > 1 */ + int stride); /* byte stride of data if + count > 1 */ +/**********************************************************************/ +/* In clover_info.c or wilson_info.c (application dependent) */ +void write_appl_w_prop_info(FILE *fp); + +/**********************************************************************/ +/* Prototypes for io_helpers_w.c */ +w_prop_file *r_open_prop(int flag, char *filename); +w_prop_file *w_open_prop(int flag, char *filename); +int reload_propagator( int flag, w_prop_file *wpf, + int spin, int color, field_offset dest, int timing); +void save_propagator( int flag, w_prop_file *wpf, + int spin, int color, field_offset src, int timing); +int ask_starting_prop( int prompt, int *flag, char *filename ); +int ask_ending_prop( int prompt, int *flag, char *filename ); +void r_close_prop(int flag, w_prop_file *wpf); +void w_close_prop(int flag, w_prop_file *wpf); + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amat_hwvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amat_hwvec.c new file mode 100644 index 0000000000000000000000000000000000000000..b5cb8381c2902dbed7cce7f5e32e75efca7d6405 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amat_hwvec.c @@ -0,0 +1,97 @@ +/************** m_amat_hwvec.c (in su3.a) ********************** +* * +* void mult_adj_su3_mat_hwvec( su3_matrix *mat, * +* half_wilson_vector *src,*dest ) * +* multiply a Wilson half-vector by the adjoint of a matrix * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST + +void mult_adj_su3_mat_hwvec( su3_matrix *mat, + half_wilson_vector *src, half_wilson_vector *dest ){ + mult_adj_su3_mat_vec(mat, &(src->h[0]), &(dest->h[0]) ); + mult_adj_su3_mat_vec(mat, &(src->h[1]), &(dest->h[1]) ); +} + +#else /* Fast version */ + +void mult_adj_su3_mat_hwvec( su3_matrix *mat, + half_wilson_vector *src, half_wilson_vector *dest ){ + +#ifdef NATIVEDOUBLE + register double a0r,a0i,a1r,a1i,a2r,a2i; + register double b0r,b0i,b1r,b1i,b2r,b2i; +#else + register radix a0r,a0i,a1r,a1i,a2r,a2i; + register radix b0r,b0i,b1r,b1i,b2r,b2i; +#endif + +/* mult_adj_su3_mat_vec(mat, &(src->h[0]), &(dest->h[0]) ); */ + + a0r=mat->e[0][0].real; a0i=mat->e[0][0].imag; + b0r=src->h[0].c[0].real; b0i=src->h[0].c[0].imag; + a1r=mat->e[1][0].real; a1i=mat->e[1][0].imag; + b1r=src->h[0].c[1].real; b1i=src->h[0].c[1].imag; + a2r=mat->e[2][0].real; a2i=mat->e[2][0].imag; + b2r=src->h[0].c[2].real; b2i=src->h[0].c[2].imag; + + dest->h[0].c[0].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i; + dest->h[0].c[0].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r; + + a0r=mat->e[0][1].real; a0i=mat->e[0][1].imag; + b0r=src->h[0].c[0].real; b0i=src->h[0].c[0].imag; + a1r=mat->e[1][1].real; a1i=mat->e[1][1].imag; + b1r=src->h[0].c[1].real; b1i=src->h[0].c[1].imag; + a2r=mat->e[2][1].real; a2i=mat->e[2][1].imag; + b2r=src->h[0].c[2].real; b2i=src->h[0].c[2].imag; + + dest->h[0].c[1].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i; + dest->h[0].c[1].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r; + + a0r=mat->e[0][2].real; a0i=mat->e[0][2].imag; + b0r=src->h[0].c[0].real; b0i=src->h[0].c[0].imag; + a1r=mat->e[1][2].real; a1i=mat->e[1][2].imag; + b1r=src->h[0].c[1].real; b1i=src->h[0].c[1].imag; + a2r=mat->e[2][2].real; a2i=mat->e[2][2].imag; + b2r=src->h[0].c[2].real; b2i=src->h[0].c[2].imag; + + dest->h[0].c[2].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i; + dest->h[0].c[2].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r; + + +/* mult_adj_su3_mat_vec(mat, &(src->h[1]), &(dest->h[1]) ); */ + + a0r=mat->e[0][0].real; a0i=mat->e[0][0].imag; + b0r=src->h[1].c[0].real; b0i=src->h[1].c[0].imag; + a1r=mat->e[1][0].real; a1i=mat->e[1][0].imag; + b1r=src->h[1].c[1].real; b1i=src->h[1].c[1].imag; + a2r=mat->e[2][0].real; a2i=mat->e[2][0].imag; + b2r=src->h[1].c[2].real; b2i=src->h[1].c[2].imag; + + dest->h[1].c[0].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i; + dest->h[1].c[0].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r; + + a0r=mat->e[0][1].real; a0i=mat->e[0][1].imag; + b0r=src->h[1].c[0].real; b0i=src->h[1].c[0].imag; + a1r=mat->e[1][1].real; a1i=mat->e[1][1].imag; + b1r=src->h[1].c[1].real; b1i=src->h[1].c[1].imag; + a2r=mat->e[2][1].real; a2i=mat->e[2][1].imag; + b2r=src->h[1].c[2].real; b2i=src->h[1].c[2].imag; + + dest->h[1].c[1].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i; + dest->h[1].c[1].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r; + + a0r=mat->e[0][2].real; a0i=mat->e[0][2].imag; + b0r=src->h[1].c[0].real; b0i=src->h[1].c[0].imag; + a1r=mat->e[1][2].real; a1i=mat->e[1][2].imag; + b1r=src->h[1].c[1].real; b1i=src->h[1].c[1].imag; + a2r=mat->e[2][2].real; a2i=mat->e[2][2].imag; + b2r=src->h[1].c[2].real; b2i=src->h[1].c[2].imag; + + dest->h[1].c[2].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i; + dest->h[1].c[2].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r; + +} +#endif /* "ifndef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amat_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amat_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..5fa5165a31d37622a04cd692db15a5daf016dd00 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amat_wvec.c @@ -0,0 +1,14 @@ +/*************** m_amat_wvec.c (in su3.a) ********************** +* * +* void mult_adj_mat_wilson_vec( su3_matrix *mat, * +* wilson_vector *src,*dest) * +* multiply a Wilson vector by the adjoint of a matrix * +*/ +#include "complex.h" +#include "su3.h" + +void mult_adj_mat_wilson_vec( su3_matrix *mat, wilson_vector *src, + wilson_vector *dest ){ + register int i; + for(i=0;i<4;i++)mult_adj_su3_mat_vec(mat, &(src->d[i]), &(dest->d[i]) ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amatvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amatvec.c new file mode 100644 index 0000000000000000000000000000000000000000..b8d925b99e0864b3c94d51c56bc129488cb251a4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amatvec.c @@ -0,0 +1,135 @@ +/***************** m_amatvec.c (in su3.a) ***************************** +* * +* void mult_adj_su3_mat_vec( su3_matrix *a, su3_vector *b,*c ) * +* C <- A_adjoint * B * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +/* adjoint matrix times vector multiply */ +void mult_adj_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c ){ +register int i,j; +register complex x,y,z; + for(i=0;i<3;i++){ + x.real=x.imag=0.0; + for(j=0;j<3;j++){ + CONJG( a->e[j][i], z ); + CMUL( z , b->c[j], y ) + CSUM( x , y ); + } + c->c[i] = x; + } +} + +#else +#ifdef NATIVEDOUBLE /* IBM RS6000 version */ +void mult_adj_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c ){ + + register double c0r,c0i,c1r,c1i,c2r,c2i; + register double br,bi,a0,a1,a2; + + br=b->c[0].real; bi=b->c[0].imag; + a0=a->e[0][0].real; + a1=a->e[0][1].real; + a2=a->e[0][2].real; + + c0r = a0*br; + c1r = a1*br; + c2r = a2*br; + c0i = a0*bi; + c1i = a1*bi; + c2i = a2*bi; + + a0=a->e[0][0].imag; + a1=a->e[0][1].imag; + a2=a->e[0][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + br=b->c[1].real; bi=b->c[1].imag; + a0=a->e[1][0].real; + a1=a->e[1][1].real; + a2=a->e[1][2].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=a->e[1][0].imag; + a1=a->e[1][1].imag; + a2=a->e[1][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + br=b->c[2].real; bi=b->c[2].imag; + a0=a->e[2][0].real; + a1=a->e[2][1].real; + a2=a->e[2][2].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=a->e[2][0].imag; + a1=a->e[2][1].imag; + a2=a->e[2][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + c->c[0].real = c0r; + c->c[0].imag = c0i; + c->c[1].real = c1r; + c->c[1].imag = c1i; + c->c[2].real = c2r; + c->c[2].imag = c2i; + +} +#else +void mult_adj_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c ){ + int i; + register radix t,ar,ai,br,bi,cr,ci; + for(i=0;i<3;i++){ + + ar=a->e[0][i].real; ai=a->e[0][i].imag; + br=b->c[0].real; bi=b->c[0].imag; + cr=ar*br; t=ai*bi; cr += t; + ci=ar*bi; t=ai*br; ci -= t; + + ar=a->e[1][i].real; ai=a->e[1][i].imag; + br=b->c[1].real; bi=b->c[1].imag; + t=ar*br; cr += t; t=ai*bi; cr += t; + t=ar*bi; ci += t; t=ai*br; ci -= t; + + ar=a->e[2][i].real; ai=a->e[2][i].imag; + br=b->c[2].real; bi=b->c[2].imag; + t=ar*br; cr += t; t=ai*bi; cr += t; + t=ar*bi; ci += t; t=ai*br; ci -= t; + + c->c[i].real=cr; + c->c[i].imag=ci; + } +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* End of "#ifndef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amatvec_ns.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amatvec_ns.c new file mode 100644 index 0000000000000000000000000000000000000000..d4f9d389ab91550b0275e2b5c9662cf7acecacda --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amatvec_ns.c @@ -0,0 +1,114 @@ +/****************** m_amatvec_ns.c (in su3.a) ************************* +* * +* void mult_adj_su3_mat_vec_nsum( su3_matrix *a, su3_vector *b,*c ) * +* adjoint matrix times vector multiply and subtract from another vector * +* C <- C - A_adjoint*B * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +void mult_adj_su3_mat_vec_nsum( su3_matrix *a, su3_vector *b, su3_vector *c ){ +register int i,j; +register complex x,y,z; + for(i=0;i<3;i++){ + x.real=x.imag=0.0; + for(j=0;j<3;j++){ + CONJG( a->e[j][i], z ); + CMUL( z , b->c[j], y ) + CSUM( x , y ); + } + c->c[i].real -= x.real; + c->c[i].imag -= x.imag; + } +} + +#else +void mult_adj_su3_mat_vec_nsum( su3_matrix *a, su3_vector *b, su3_vector *c ){ + +#ifdef NATIVEDOUBLE + register double c0r,c0i,c1r,c1i,c2r,c2i; + register double br,bi,a0,a1,a2; +#else + register radix c0r,c0i,c1r,c1i,c2r,c2i; + register radix br,bi,a0,a1,a2; +#endif + + br=b->c[0].real; bi=b->c[0].imag; + a0=a->e[0][0].real; + a1=a->e[0][1].real; + a2=a->e[0][2].real; + + c0r = a0*br; + c1r = a1*br; + c2r = a2*br; + c0i = a0*bi; + c1i = a1*bi; + c2i = a2*bi; + + a0=a->e[0][0].imag; + a1=a->e[0][1].imag; + a2=a->e[0][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + br=b->c[1].real; bi=b->c[1].imag; + a0=a->e[1][0].real; + a1=a->e[1][1].real; + a2=a->e[1][2].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=a->e[1][0].imag; + a1=a->e[1][1].imag; + a2=a->e[1][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + br=b->c[2].real; bi=b->c[2].imag; + a0=a->e[2][0].real; + a1=a->e[2][1].real; + a2=a->e[2][2].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=a->e[2][0].imag; + a1=a->e[2][1].imag; + a2=a->e[2][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + c->c[0].real -= c0r; + c->c[0].imag -= c0i; + c->c[1].real -= c1r; + c->c[1].imag -= c1i; + c->c[2].real -= c2r; + c->c[2].imag -= c2i; + +} +#endif /* End of "#ifdef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amatvec_s.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amatvec_s.c new file mode 100644 index 0000000000000000000000000000000000000000..0f090976cb5014364e6b608a161b527c29cc14a5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amatvec_s.c @@ -0,0 +1,114 @@ +/******************* m_amatvec_s.c (in su3.a) ************************* +* * +* void mult_adj_su3_mat_vec_sum( su3_matrix *a, su3_vector *b,*c ) * +* adjoint matrix times vector multiply and add to another vector * +* C <- C + A_adjoint*B * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +void mult_adj_su3_mat_vec_sum( su3_matrix *a, su3_vector *b, su3_vector *c ){ +register int i,j; +register complex x,y,z; + for(i=0;i<3;i++){ + x.real=x.imag=0.0; + for(j=0;j<3;j++){ + CONJG( a->e[j][i], z ); + CMUL( z , b->c[j], y ) + CSUM( x , y ); + } + c->c[i].real += x.real; + c->c[i].imag += x.imag; + } +} + +#else +void mult_adj_su3_mat_vec_sum( su3_matrix *a, su3_vector *b, su3_vector *c ){ + +#ifdef NATIVEDOUBLE + register double c0r,c0i,c1r,c1i,c2r,c2i; + register double br,bi,a0,a1,a2; +#else + register radix c0r,c0i,c1r,c1i,c2r,c2i; + register radix br,bi,a0,a1,a2; +#endif + + br=b->c[0].real; bi=b->c[0].imag; + a0=a->e[0][0].real; + a1=a->e[0][1].real; + a2=a->e[0][2].real; + + c0r = a0*br; + c1r = a1*br; + c2r = a2*br; + c0i = a0*bi; + c1i = a1*bi; + c2i = a2*bi; + + a0=a->e[0][0].imag; + a1=a->e[0][1].imag; + a2=a->e[0][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + br=b->c[1].real; bi=b->c[1].imag; + a0=a->e[1][0].real; + a1=a->e[1][1].real; + a2=a->e[1][2].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=a->e[1][0].imag; + a1=a->e[1][1].imag; + a2=a->e[1][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + br=b->c[2].real; bi=b->c[2].imag; + a0=a->e[2][0].real; + a1=a->e[2][1].real; + a2=a->e[2][2].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=a->e[2][0].imag; + a1=a->e[2][1].imag; + a2=a->e[2][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + c->c[0].real += c0r; + c->c[0].imag += c0i; + c->c[1].real += c1r; + c->c[1].imag += c1i; + c->c[2].real += c2r; + c->c[2].imag += c2i; +} + +#endif /* End of "#ifdef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amv_4dir.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amv_4dir.c new file mode 100644 index 0000000000000000000000000000000000000000..816863dfbe5a87a684e2e46c3ad9b693765a5b34 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amv_4dir.c @@ -0,0 +1,117 @@ +/***************** m_amv_4dir.c (in su3.a) ***************************** +* * +* void mult_adj_su3_mat_vec_4dir( su3_matrix *mat, * +* su3_vector *src, su3_vector *dest ) * +* Multiply an su3_vector by an array of four adjoint su3_matrices, * +* result in an array of four su3_vectors. * +* dest[i] <- A_adjoint[i] * src * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +void mult_adj_su3_mat_vec_4dir( su3_matrix *mat, su3_vector *src, + su3_vector *dest ) { + mult_adj_su3_mat_vec( mat+0, src, dest+0 ); + mult_adj_su3_mat_vec( mat+1, src, dest+1 ); + mult_adj_su3_mat_vec( mat+2, src, dest+2 ); + mult_adj_su3_mat_vec( mat+3, src, dest+3 ); +} + +#else +/* Fast code, with subroutines inlined */ + +void mult_adj_su3_mat_vec_4dir( su3_matrix *mat, su3_vector *src, + su3_vector *dest ){ + register int n; +#ifdef NATIVEDOUBLE + register double c0r,c0i,c1r,c1i,c2r,c2i; + register double br,bi,a0,a1,a2; +#else + register radix c0r,c0i,c1r,c1i,c2r,c2i; + register radix br,bi,a0,a1,a2; +#endif + register su3_matrix *a; + register su3_vector *b,*c; + + a = mat; c = dest ; b = src; + for(n=0;n<4;n++,a++,c++){ + + br=b->c[0].real; bi=b->c[0].imag; + a0=a->e[0][0].real; + a1=a->e[0][1].real; + a2=a->e[0][2].real; + + c0r = a0*br; + c1r = a1*br; + c2r = a2*br; + c0i = a0*bi; + c1i = a1*bi; + c2i = a2*bi; + + a0=a->e[0][0].imag; + a1=a->e[0][1].imag; + a2=a->e[0][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + br=b->c[1].real; bi=b->c[1].imag; + a0=a->e[1][0].real; + a1=a->e[1][1].real; + a2=a->e[1][2].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=a->e[1][0].imag; + a1=a->e[1][1].imag; + a2=a->e[1][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + br=b->c[2].real; bi=b->c[2].imag; + a0=a->e[2][0].real; + a1=a->e[2][1].real; + a2=a->e[2][2].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=a->e[2][0].imag; + a1=a->e[2][1].imag; + a2=a->e[2][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + c->c[0].real = c0r; + c->c[0].imag = c0i; + c->c[1].real = c1r; + c->c[1].imag = c1i; + c->c[2].real = c2r; + c->c[2].imag = c2i; + } +} +#endif /* End of "#ifndef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amv_4dir_2.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amv_4dir_2.c new file mode 100644 index 0000000000000000000000000000000000000000..d2772bbd69740a282dc4c74816e354669988f445 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amv_4dir_2.c @@ -0,0 +1,19 @@ +/***************** m_amv_4dir_2.c (in su3.a) ***************************** +* * +* void mult_adj_su3_mat_vec_4dir_2( su3_matrix *mat, * +* su3_vector *src, su3_vector *dest ) * +* Multiply an su3_vector by an array of four adjoint su3_matrices, * +* result in an array of four su3_vectors. * +* dest[i] <- A_adjoint[i] * src * +*/ +#include "complex.h" +#include "su3.h" + +void mult_adj_su3_mat_vec_4dir_2( su3_matrix *mat, su3_vector *src, + su3_vector *xdest, su3_vector *ydest, su3_vector *zdest, + su3_vector *tdest ) { + mult_adj_su3_mat_vec( mat+0, src, xdest ); + mult_adj_su3_mat_vec( mat+1, src, ydest ); + mult_adj_su3_mat_vec( mat+2, src, zdest ); + mult_adj_su3_mat_vec( mat+3, src, tdest ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_an.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_an.c new file mode 100644 index 0000000000000000000000000000000000000000..a0e387382c85a49cd87750d78c45c2b1cc511ea9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_an.c @@ -0,0 +1,73 @@ +/****************** m_mat_an.c (in su3.a) ***************************** +* * +* void mult_su3_an( su3_matrix *a,*b,*c ) * +* matrix multiply, first matrix is adjoint * +* C <- A_adjoint*B * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +void mult_su3_an( su3_matrix *a, su3_matrix *b, su3_matrix *c ){ +register int i,j,k; +register complex x,y; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + x.real=x.imag=0.0; + for(k=0;k<3;k++){ + CMULJ_( a->e[k][i] , b->e[k][j], y ); + CSUM( x , y ); + } + c->e[i][j] = x; + } +} + +/* "Hand coded" routines, clearer coding is up above */ +#else + +void mult_su3_an( su3_matrix *a, su3_matrix *b, su3_matrix *c ){ + int j; + +#ifdef NATIVEDOUBLE + register double a0r,a0i,a1r,a1i,a2r,a2i; + register double b0r,b0i,b1r,b1i,b2r,b2i; +#else + register radix a0r,a0i,a1r,a1i,a2r,a2i; + register radix b0r,b0i,b1r,b1i,b2r,b2i; +#endif + + for(j=0;j<3;j++){ + + a0r=a->e[0][0].real; a0i=a->e[0][0].imag; + b0r=b->e[0][j].real; b0i=b->e[0][j].imag; + a1r=a->e[1][0].real; a1i=a->e[1][0].imag; + b1r=b->e[1][j].real; b1i=b->e[1][j].imag; + a2r=a->e[2][0].real; a2i=a->e[2][0].imag; + b2r=b->e[2][j].real; b2i=b->e[2][j].imag; + + c->e[0][j].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i; + c->e[0][j].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r; + + a0r=a->e[0][1].real; a0i=a->e[0][1].imag; + b0r=b->e[0][j].real; b0i=b->e[0][j].imag; + a1r=a->e[1][1].real; a1i=a->e[1][1].imag; + b1r=b->e[1][j].real; b1i=b->e[1][j].imag; + a2r=a->e[2][1].real; a2i=a->e[2][1].imag; + b2r=b->e[2][j].real; b2i=b->e[2][j].imag; + + c->e[1][j].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i; + c->e[1][j].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r; + + a0r=a->e[0][2].real; a0i=a->e[0][2].imag; + b0r=b->e[0][j].real; b0i=b->e[0][j].imag; + a1r=a->e[1][2].real; a1i=a->e[1][2].imag; + b1r=b->e[1][j].real; b1i=b->e[1][j].imag; + a2r=a->e[2][2].real; a2i=a->e[2][2].imag; + b2r=b->e[2][j].real; b2i=b->e[2][j].imag; + + c->e[2][j].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i; + c->e[2][j].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r; + + } +} + +#endif /* End of "#ifdef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_hwvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_hwvec.c new file mode 100644 index 0000000000000000000000000000000000000000..ed60ea4919aa8d634cd60ae5c39fc5c1818bcc5b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_hwvec.c @@ -0,0 +1,99 @@ +/************** m_mat_hwvec.c (in su3.a) *********************** +* * +* void mult_su3_mat_hwvec(su3_matrix *mat, * +* half_wilson_vector *src,*dest) * +* multiply a Wilson half-vector by a matrix * +* dest <- mat*src * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST + +void mult_su3_mat_hwvec( su3_matrix *mat, half_wilson_vector *src, + half_wilson_vector *dest ){ + mult_su3_mat_vec(mat, &(src->h[0]), &(dest->h[0]) ); + mult_su3_mat_vec(mat, &(src->h[1]), &(dest->h[1]) ); +} + +#else /* Fast version */ + + +void mult_su3_mat_hwvec( su3_matrix *mat, half_wilson_vector *src, + half_wilson_vector *dest ){ + +#ifdef NATIVEDOUBLE + register double a0r,a0i,a1r,a1i,a2r,a2i; + register double b0r,b0i,b1r,b1i,b2r,b2i; +#else + register radix a0r,a0i,a1r,a1i,a2r,a2i; + register radix b0r,b0i,b1r,b1i,b2r,b2i; +#endif + +/* mult_su3_mat_vec(mat, &(src->h[0]), &(dest->h[0]) ); */ + + a0r=mat->e[0][0].real; a0i=mat->e[0][0].imag; + b0r=src->h[0].c[0].real; b0i=src->h[0].c[0].imag; + a1r=mat->e[0][1].real; a1i=mat->e[0][1].imag; + b1r=src->h[0].c[1].real; b1i=src->h[0].c[1].imag; + a2r=mat->e[0][2].real; a2i=mat->e[0][2].imag; + b2r=src->h[0].c[2].real; b2i=src->h[0].c[2].imag; + + dest->h[0].c[0].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + dest->h[0].c[0].imag = a0r*b0i + a0i*b0r + a1r*b1i + a1i*b1r + a2r*b2i + a2i*b2r; + + a0r=mat->e[1][0].real; a0i=mat->e[1][0].imag; + b0r=src->h[0].c[0].real; b0i=src->h[0].c[0].imag; + a1r=mat->e[1][1].real; a1i=mat->e[1][1].imag; + b1r=src->h[0].c[1].real; b1i=src->h[0].c[1].imag; + a2r=mat->e[1][2].real; a2i=mat->e[1][2].imag; + b2r=src->h[0].c[2].real; b2i=src->h[0].c[2].imag; + + dest->h[0].c[1].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + dest->h[0].c[1].imag = a0r*b0i + a0i*b0r + a1r*b1i + a1i*b1r + a2r*b2i + a2i*b2r; + + a0r=mat->e[2][0].real; a0i=mat->e[2][0].imag; + b0r=src->h[0].c[0].real; b0i=src->h[0].c[0].imag; + a1r=mat->e[2][1].real; a1i=mat->e[2][1].imag; + b1r=src->h[0].c[1].real; b1i=src->h[0].c[1].imag; + a2r=mat->e[2][2].real; a2i=mat->e[2][2].imag; + b2r=src->h[0].c[2].real; b2i=src->h[0].c[2].imag; + + dest->h[0].c[2].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + dest->h[0].c[2].imag = a0r*b0i + a0i*b0r + a1r*b1i + a1i*b1r + a2r*b2i + a2i*b2r; + +/* mult_su3_mat_vec(mat, &(src->h[1]), &(dest->h[1]) ); */ + + a0r=mat->e[0][0].real; a0i=mat->e[0][0].imag; + b0r=src->h[1].c[0].real; b0i=src->h[1].c[0].imag; + a1r=mat->e[0][1].real; a1i=mat->e[0][1].imag; + b1r=src->h[1].c[1].real; b1i=src->h[1].c[1].imag; + a2r=mat->e[0][2].real; a2i=mat->e[0][2].imag; + b2r=src->h[1].c[2].real; b2i=src->h[1].c[2].imag; + + dest->h[1].c[0].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + dest->h[1].c[0].imag = a0r*b0i + a0i*b0r + a1r*b1i + a1i*b1r + a2r*b2i + a2i*b2r; + + a0r=mat->e[1][0].real; a0i=mat->e[1][0].imag; + b0r=src->h[1].c[0].real; b0i=src->h[1].c[0].imag; + a1r=mat->e[1][1].real; a1i=mat->e[1][1].imag; + b1r=src->h[1].c[1].real; b1i=src->h[1].c[1].imag; + a2r=mat->e[1][2].real; a2i=mat->e[1][2].imag; + b2r=src->h[1].c[2].real; b2i=src->h[1].c[2].imag; + + dest->h[1].c[1].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + dest->h[1].c[1].imag = a0r*b0i + a0i*b0r + a1r*b1i + a1i*b1r + a2r*b2i + a2i*b2r; + + a0r=mat->e[2][0].real; a0i=mat->e[2][0].imag; + b0r=src->h[1].c[0].real; b0i=src->h[1].c[0].imag; + a1r=mat->e[2][1].real; a1i=mat->e[2][1].imag; + b1r=src->h[1].c[1].real; b1i=src->h[1].c[1].imag; + a2r=mat->e[2][2].real; a2i=mat->e[2][2].imag; + b2r=src->h[1].c[2].real; b2i=src->h[1].c[2].imag; + + dest->h[1].c[2].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + dest->h[1].c[2].imag = a0r*b0i + a0i*b0r + a1r*b1i + a1i*b1r + a2r*b2i + a2i*b2r; + +} + +#endif /* "ifndef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_na.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_na.c new file mode 100644 index 0000000000000000000000000000000000000000..5d35d08f49bea8f1dde9f0794fff6fec029fde10 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_na.c @@ -0,0 +1,52 @@ +/**************** m_mat_na.c (in su3.a) ******************************* +* * +* void mult_su3_na( su3_matrix *a,*b,*c ) * +* matrix multiply, second matrix is adjoint * +* C <- A*B_adjoint * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +void mult_su3_na( su3_matrix *a, su3_matrix *b, su3_matrix *c ){ +register int i,j,k; +register complex x,y; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + x.real=x.imag=0.0; + for(k=0;k<3;k++){ + CMUL_J( a->e[i][k] , b->e[j][k] , y ); + CSUM( x , y ); + } + c->e[i][j] = x; + } +} + +/* "Hand coded" routines, clearer coding is up above */ +#else + +void mult_su3_na( su3_matrix *a, su3_matrix *b, su3_matrix *c ){ +int i,j,k; +register radix t,ar,ai,br,bi,cr,ci; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + + ar=a->e[i][0].real; ai=a->e[i][0].imag; + br=b->e[j][0].real; bi=b->e[j][0].imag; + cr=ar*br; t=ai*bi; cr += t; + ci=ai*br; t=ar*bi; ci -= t; + + ar=a->e[i][1].real; ai=a->e[i][1].imag; + br=b->e[j][1].real; bi=b->e[j][1].imag; + t=ar*br; cr += t; t=ai*bi; cr += t; + t=ar*bi; ci -= t; t=ai*br; ci += t; + + ar=a->e[i][2].real; ai=a->e[i][2].imag; + br=b->e[j][2].real; bi=b->e[j][2].imag; + t=ar*br; cr += t; t=ai*bi; cr += t; + t=ar*bi; ci -= t; t=ai*br; ci += t; + + c->e[i][j].real=cr; + c->e[i][j].imag=ci; + } +} + +#endif /* End of "#ifdef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_nn.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_nn.c new file mode 100644 index 0000000000000000000000000000000000000000..61313f151825ccca8fda666473d9bb15f1cac605 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_nn.c @@ -0,0 +1,94 @@ +/******************* m_mat_nn.c (in su3.a) **************************** +* * +* void mult_su3_nn( su3_matrix *a,*b,*c ) * +* matrix multiply, no adjoints * +* C <- A*B * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +void mult_su3_nn( su3_matrix *a, su3_matrix *b, su3_matrix *c ){ +register int i,j,k; +register complex x,y; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + x.real=x.imag=0.0; + for(k=0;k<3;k++){ + CMUL( a->e[i][k] , b->e[k][j] , y ); + CSUM( x , y ); + } + c->e[i][j] = x; + } +} + +/* "Hand coded" routines, clearer coding is up above */ +#else +#ifdef NATIVEDOUBLE /* RS6000 version */ + +void mult_su3_nn( su3_matrix *a, su3_matrix *b, su3_matrix *c ){ + int j; + register double a0r,a0i,a1r,a1i,a2r,a2i; + register double b0r,b0i,b1r,b1i,b2r,b2i; + + for(j=0;j<3;j++){ + + a0r=a->e[0][0].real; a0i=a->e[0][0].imag; + b0r=b->e[0][j].real; b0i=b->e[0][j].imag; + a1r=a->e[0][1].real; a1i=a->e[0][1].imag; + b1r=b->e[1][j].real; b1i=b->e[1][j].imag; + a2r=a->e[0][2].real; a2i=a->e[0][2].imag; + b2r=b->e[2][j].real; b2i=b->e[2][j].imag; + + c->e[0][j].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + c->e[0][j].imag = a0r*b0i + a0i*b0r + a1r*b1i + a1i*b1r + a2r*b2i + a2i*b2r; + + a0r=a->e[1][0].real; a0i=a->e[1][0].imag; + b0r=b->e[0][j].real; b0i=b->e[0][j].imag; + a1r=a->e[1][1].real; a1i=a->e[1][1].imag; + b1r=b->e[1][j].real; b1i=b->e[1][j].imag; + a2r=a->e[1][2].real; a2i=a->e[1][2].imag; + b2r=b->e[2][j].real; b2i=b->e[2][j].imag; + + c->e[1][j].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + c->e[1][j].imag = a0r*b0i + a0i*b0r + a1r*b1i + a1i*b1r + a2r*b2i + a2i*b2r; + + a0r=a->e[2][0].real; a0i=a->e[2][0].imag; + b0r=b->e[0][j].real; b0i=b->e[0][j].imag; + a1r=a->e[2][1].real; a1i=a->e[2][1].imag; + b1r=b->e[1][j].real; b1i=b->e[1][j].imag; + a2r=a->e[2][2].real; a2i=a->e[2][2].imag; + b2r=b->e[2][j].real; b2i=b->e[2][j].imag; + + c->e[2][j].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + c->e[2][j].imag = a0r*b0i + a0i*b0r + a1r*b1i + a1i*b1r + a2r*b2i + a2i*b2r; + + } +} +#else + +void mult_su3_nn( su3_matrix *a, su3_matrix *b, su3_matrix *c ){ + int i,j,k; + register radix t,ar,ai,br,bi,cr,ci; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + + ar=a->e[i][0].real; ai=a->e[i][0].imag; + br=b->e[0][j].real; bi=b->e[0][j].imag; + cr=ar*br; t=ai*bi; cr -= t; + ci=ar*bi; t=ai*br; ci += t; + + ar=a->e[i][1].real; ai=a->e[i][1].imag; + br=b->e[1][j].real; bi=b->e[1][j].imag; + t=ar*br; cr += t; t=ai*bi; cr -= t; + t=ar*bi; ci += t; t=ai*br; ci += t; + + ar=a->e[i][2].real; ai=a->e[i][2].imag; + br=b->e[2][j].real; bi=b->e[2][j].imag; + t=ar*br; cr += t; t=ai*bi; cr -= t; + t=ar*bi; ci += t; t=ai*br; ci += t; + + c->e[i][j].real=cr; + c->e[i][j].imag=ci; + } +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* End of "#ifdef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..fbdaa7be78a630f7acf40ac0c2e7ff31ba3de20b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_wvec.c @@ -0,0 +1,14 @@ +/****************** m_mat_wvec.c (in su3.a) ******************** +* * +*void mult_mat_wilson_vec(su3_matrix *mat, wilson_vector *src,*dest) * +* multiply a Wilson vector by a matrix * +* dest <- mat*src * +*/ +#include "complex.h" +#include "su3.h" + +void mult_mat_wilson_vec( su3_matrix *mat, wilson_vector *src, + wilson_vector *dest ){ + register int i; + for(i=0;i<4;i++)mult_su3_mat_vec(mat, &(src->d[i]), &(dest->d[i]) ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_matvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_matvec.c new file mode 100644 index 0000000000000000000000000000000000000000..aaff46ae5ac4496ab53f731676695e8f52ea9cf8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_matvec.c @@ -0,0 +1,88 @@ +/**************** m_matvec.c (in su3.a) ******************************* +* * +* void mult_su3_mat_vec( su3_matrix *a, su3_vector *b,*c ) * +* matrix times vector multiply, no adjoints * +* C <- A*B * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +void mult_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c ){ +register int i,j; +register complex x,y; + for(i=0;i<3;i++){ + x.real=x.imag=0.0; + for(j=0;j<3;j++){ + CMUL( a->e[i][j] , b->c[j] , y ) + CSUM( x , y ); + } + c->c[i] = x; + } +} +#else +#ifdef NATIVEDOUBLE /* RS6000 version */ +void mult_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c ){ + + register double a0r,a0i,a1r,a1i,a2r,a2i; + register double b0r,b0i,b1r,b1i,b2r,b2i; + + a0r=a->e[0][0].real; a0i=a->e[0][0].imag; + b0r=b->c[0].real; b0i=b->c[0].imag; + a1r=a->e[0][1].real; a1i=a->e[0][1].imag; + b1r=b->c[1].real; b1i=b->c[1].imag; + a2r=a->e[0][2].real; a2i=a->e[0][2].imag; + b2r=b->c[2].real; b2i=b->c[2].imag; + + c->c[0].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + c->c[0].imag = a0r*b0i + a0i*b0r + a1r*b1i + a1i*b1r + a2r*b2i + a2i*b2r; + + a0r=a->e[1][0].real; a0i=a->e[1][0].imag; + b0r=b->c[0].real; b0i=b->c[0].imag; + a1r=a->e[1][1].real; a1i=a->e[1][1].imag; + b1r=b->c[1].real; b1i=b->c[1].imag; + a2r=a->e[1][2].real; a2i=a->e[1][2].imag; + b2r=b->c[2].real; b2i=b->c[2].imag; + + c->c[1].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + c->c[1].imag = a0r*b0i + a0i*b0r + a1r*b1i + a1i*b1r + a2r*b2i + a2i*b2r; + + a0r=a->e[2][0].real; a0i=a->e[2][0].imag; + b0r=b->c[0].real; b0i=b->c[0].imag; + a1r=a->e[2][1].real; a1i=a->e[2][1].imag; + b1r=b->c[1].real; b1i=b->c[1].imag; + a2r=a->e[2][2].real; a2i=a->e[2][2].imag; + b2r=b->c[2].real; b2i=b->c[2].imag; + + c->c[2].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + c->c[2].imag = a0r*b0i + a0i*b0r + a1r*b1i + a1i*b1r + a2r*b2i + a2i*b2r; + +} + +#else +void mult_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c ){ +int i,j,k; +register radix t,ar,ai,br,bi,cr,ci; + for(i=0;i<3;i++){ + + ar=a->e[i][0].real; ai=a->e[i][0].imag; + br=b->c[0].real; bi=b->c[0].imag; + cr=ar*br; t=ai*bi; cr -= t; + ci=ar*bi; t=ai*br; ci += t; + + ar=a->e[i][1].real; ai=a->e[i][1].imag; + br=b->c[1].real; bi=b->c[1].imag; + t=ar*br; cr += t; t=ai*bi; cr -= t; + t=ar*bi; ci += t; t=ai*br; ci += t; + + ar=a->e[i][2].real; ai=a->e[i][2].imag; + br=b->c[2].real; bi=b->c[2].imag; + t=ar*br; cr += t; t=ai*bi; cr -= t; + t=ar*bi; ci += t; t=ai*br; ci += t; + + c->c[i].real=cr; + c->c[i].imag=ci; + } +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* End of "#infdef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_matvec_ns.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_matvec_ns.c new file mode 100644 index 0000000000000000000000000000000000000000..9f32f8b5c5c53265e1bf1437745b262db9ef506f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_matvec_ns.c @@ -0,0 +1,146 @@ +/***************** m_matvec_ns.c (in su3.a) *************************** +* * +* void mult_su3_mat_vec_nsum( su3_matrix *a, su3_vector *b,*c ) * +* su3_matrix times su3_vector multiply and subtract from another * +* su3_vector * +* C <- C - A*B * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +/* su3_matrix times su3_vector multiply and subtract from another su3_vector */ +/* c <- A*b-c */ +void mult_su3_mat_vec_nsum( su3_matrix *a, su3_vector *b, su3_vector *c ){ +register int i,j; +register complex x,y; + for(i=0;i<3;i++){ + x.real=x.imag=0.0; + for(j=0;j<3;j++){ + CMUL( a->e[i][j] , b->c[j] , y ) + CSUM( x , y ); + } + c->c[i].real -= x.real; + c->c[i].imag -= x.imag; + } +} + +#else +#ifdef NATIVEDOUBLE +void mult_su3_mat_vec_nsum( su3_matrix *a, su3_vector *b, su3_vector *c ){ + + register double c0r,c0i,c1r,c1i,c2r,c2i; + register double br,bi,a0,a1,a2; + + c0r = c->c[0].real; + c0i = c->c[0].imag; + c1r = c->c[1].real; + c1i = c->c[1].imag; + c2r = c->c[2].real; + c2i = c->c[2].imag; + + br=b->c[0].real; bi=b->c[0].imag; + a0=a->e[0][0].real; + a1=a->e[1][0].real; + a2=a->e[2][0].real; + + c0r -= a0*br; + c1r -= a1*br; + c2r -= a2*br; + c0i -= a0*bi; + c1i -= a1*bi; + c2i -= a2*bi; + + a0=a->e[0][0].imag; + a1=a->e[1][0].imag; + a2=a->e[2][0].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + br=b->c[1].real; bi=b->c[1].imag; + a0=a->e[0][1].real; + a1=a->e[1][1].real; + a2=a->e[2][1].real; + + c0r -= a0*br; + c1r -= a1*br; + c2r -= a2*br; + c0i -= a0*bi; + c1i -= a1*bi; + c2i -= a2*bi; + + a0=a->e[0][1].imag; + a1=a->e[1][1].imag; + a2=a->e[2][1].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + br=b->c[2].real; bi=b->c[2].imag; + a0=a->e[0][2].real; + a1=a->e[1][2].real; + a2=a->e[2][2].real; + + c0r -= a0*br; + c1r -= a1*br; + c2r -= a2*br; + c0i -= a0*bi; + c1i -= a1*bi; + c2i -= a2*bi; + + a0=a->e[0][2].imag; + a1=a->e[1][2].imag; + a2=a->e[2][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + c->c[0].real = c0r; + c->c[0].imag = c0i; + c->c[1].real = c1r; + c->c[1].imag = c1i; + c->c[2].real = c2r; + c->c[2].imag = c2i; + +} + +#else +void mult_su3_mat_vec_nsum( su3_matrix *a, su3_vector *b, su3_vector *c ){ +int i,j,k; +register radix t,ar,ai,br,bi,cr,ci; + for(i=0;i<3;i++){ + + ar=a->e[i][0].real; ai=a->e[i][0].imag; + br=b->c[0].real; bi=b->c[0].imag; + cr=ar*br; t=ai*bi; cr -= t; + ci=ar*bi; t=ai*br; ci += t; + + ar=a->e[i][1].real; ai=a->e[i][1].imag; + br=b->c[1].real; bi=b->c[1].imag; + t=ar*br; cr += t; t=ai*bi; cr -= t; + t=ar*bi; ci += t; t=ai*br; ci += t; + + ar=a->e[i][2].real; ai=a->e[i][2].imag; + br=b->c[2].real; bi=b->c[2].imag; + t=ar*br; cr += t; t=ai*bi; cr -= t; + t=ar*bi; ci += t; t=ai*br; ci += t; + + c->c[i].real -= cr; + c->c[i].imag -= ci; + } +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* End of "#ifdef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_matvec_s.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_matvec_s.c new file mode 100644 index 0000000000000000000000000000000000000000..b1bcab0d94d5bd6618470882deb9fd028e7e73cc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_matvec_s.c @@ -0,0 +1,144 @@ +/**************** m_matvec_s.c (in su3.a) ***************************** +* * +* void mult_su3_mat_vec_sum( su3_matrix *a, su3_vector *b,*c ) * +* su3_matrix times su3_vector multiply and add to another su3_vector * +* C <- C + A*B * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +/* su3_matrix times su3_vector multiply and add to another su3_vector */ +/* c <- A*b+c */ +void mult_su3_mat_vec_sum( su3_matrix *a, su3_vector *b, su3_vector *c ){ +register int i,j; +register complex x,y; + for(i=0;i<3;i++){ + x.real=x.imag=0.0; + for(j=0;j<3;j++){ + CMUL( a->e[i][j] , b->c[j] , y ) + CSUM( x , y ); + } + c->c[i].real += x.real; + c->c[i].imag += x.imag; + } +} + +#else +#ifdef NATIVEDOUBLE /* RS6000 version */ +void mult_su3_mat_vec_sum(a,b,c) su3_matrix *a; su3_vector *b,*c; { + + register double c0r,c0i,c1r,c1i,c2r,c2i; + register double br,bi,a0,a1,a2; + + c0r = c->c[0].real; + c0i = c->c[0].imag; + c1r = c->c[1].real; + c1i = c->c[1].imag; + c2r = c->c[2].real; + c2i = c->c[2].imag; + + br=b->c[0].real; bi=b->c[0].imag; + a0=a->e[0][0].real; + a1=a->e[1][0].real; + a2=a->e[2][0].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=a->e[0][0].imag; + a1=a->e[1][0].imag; + a2=a->e[2][0].imag; + + c0r -= a0*bi; + c1r -= a1*bi; + c2r -= a2*bi; + c0i += a0*br; + c1i += a1*br; + c2i += a2*br; + + br=b->c[1].real; bi=b->c[1].imag; + a0=a->e[0][1].real; + a1=a->e[1][1].real; + a2=a->e[2][1].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=a->e[0][1].imag; + a1=a->e[1][1].imag; + a2=a->e[2][1].imag; + + c0r -= a0*bi; + c1r -= a1*bi; + c2r -= a2*bi; + c0i += a0*br; + c1i += a1*br; + c2i += a2*br; + + br=b->c[2].real; bi=b->c[2].imag; + a0=a->e[0][2].real; + a1=a->e[1][2].real; + a2=a->e[2][2].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=a->e[0][2].imag; + a1=a->e[1][2].imag; + a2=a->e[2][2].imag; + + c0r -= a0*bi; + c1r -= a1*bi; + c2r -= a2*bi; + c0i += a0*br; + c1i += a1*br; + c2i += a2*br; + + c->c[0].real = c0r; + c->c[0].imag = c0i; + c->c[1].real = c1r; + c->c[1].imag = c1i; + c->c[2].real = c2r; + c->c[2].imag = c2i; + +} +#else +void mult_su3_mat_vec_sum( su3_matrix *a, su3_vector *b, su3_vector *c ){ +int i,j,k; +register radix t,ar,ai,br,bi,cr,ci; + for(i=0;i<3;i++){ + + ar=a->e[i][0].real; ai=a->e[i][0].imag; + br=b->c[0].real; bi=b->c[0].imag; + cr=ar*br; t=ai*bi; cr -= t; + ci=ar*bi; t=ai*br; ci += t; + + ar=a->e[i][1].real; ai=a->e[i][1].imag; + br=b->c[1].real; bi=b->c[1].imag; + t=ar*br; cr += t; t=ai*bi; cr -= t; + t=ar*bi; ci += t; t=ai*br; ci += t; + + ar=a->e[i][2].real; ai=a->e[i][2].imag; + br=b->c[2].real; bi=b->c[2].imag; + t=ar*br; cr += t; t=ai*bi; cr -= t; + t=ar*bi; ci += t; t=ai*br; ci += t; + + c->c[i].real += cr; + c->c[i].imag += ci; + } +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* End of "#ifdef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mv_s_4dir.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mv_s_4dir.c new file mode 100644 index 0000000000000000000000000000000000000000..d8e928f382cc1b4985a0105837cf13c32dbf949f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mv_s_4dir.c @@ -0,0 +1,168 @@ +/**************** m_mv_s_4dir.c (in su3.a) ***************************** +* * +* void mult_su3_mat_vec_sum_4dir( su3_matrix *a, su3_vector *b[0123],*c )* +* Multiply the elements of an array of four su3_matrices by the * +* four su3_vectors, and add the results to * +* produce a single su3_vector. * +* C <- A[0]*B[0]+A[1]*B[1]+A[2]*B[2]+A[3]*B[3] * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +void mult_su3_mat_vec_sum_4dir( su3_matrix *a, su3_vector *b0, + su3_vector *b1, su3_vector *b2, su3_vector *b3, su3_vector *c ){ + mult_su3_mat_vec( a+0,b0,c ); + mult_su3_mat_vec_sum( a+1,b1,c ); + mult_su3_mat_vec_sum( a+2,b2,c ); + mult_su3_mat_vec_sum( a+3,b3,c ); +} + +#else +/* Fast code, with subroutines inlined */ +#ifdef NATIVEDOUBLE /* IBM RS6000 version */ +void mult_su3_mat_vec_sum_4dir( su3_matrix *a, su3_vector *b0, + su3_vector *b1, su3_vector *b2, su3_vector *b3, su3_vector *c ){ + + register int n; + register double c0r,c0i,c1r,c1i,c2r,c2i; + register double br,bi,a0,a1,a2; + register su3_matrix *mat; + register su3_vector *b; + + c0r = c0i = c1r = c1i = c2r = c2i = 0.0; + mat = a; + + for(n=0;n<4;n++,mat++){ + + switch(n){ + case(0): b=b0; break; + case(1): b=b1; break; + case(2): b=b2; break; + case(3): b=b3; break; + } + + br=b->c[0].real; bi=b->c[0].imag; + a0=mat->e[0][0].real; + a1=mat->e[1][0].real; + a2=mat->e[2][0].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=mat->e[0][0].imag; + a1=mat->e[1][0].imag; + a2=mat->e[2][0].imag; + + c0r -= a0*bi; + c1r -= a1*bi; + c2r -= a2*bi; + c0i += a0*br; + c1i += a1*br; + c2i += a2*br; + + br=b->c[1].real; bi=b->c[1].imag; + a0=mat->e[0][1].real; + a1=mat->e[1][1].real; + a2=mat->e[2][1].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=mat->e[0][1].imag; + a1=mat->e[1][1].imag; + a2=mat->e[2][1].imag; + + c0r -= a0*bi; + c1r -= a1*bi; + c2r -= a2*bi; + c0i += a0*br; + c1i += a1*br; + c2i += a2*br; + + br=b->c[2].real; bi=b->c[2].imag; + a0=mat->e[0][2].real; + a1=mat->e[1][2].real; + a2=mat->e[2][2].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=mat->e[0][2].imag; + a1=mat->e[1][2].imag; + a2=mat->e[2][2].imag; + + c0r -= a0*bi; + c1r -= a1*bi; + c2r -= a2*bi; + c0i += a0*br; + c1i += a1*br; + c2i += a2*br; + + } + + c->c[0].real = c0r; + c->c[0].imag = c0i; + c->c[1].real = c1r; + c->c[1].imag = c1i; + c->c[2].real = c2r; + c->c[2].imag = c2i; + +} + +#else +void mult_su3_mat_vec_sum_4dir( su3_matrix *a, su3_vector *b0, + su3_vector *b1, su3_vector *b2, su3_vector *b3, su3_vector *c ){ + int i,n; + register su3_matrix *at; + register su3_vector *b; + register radix t,ar,ai,br,bi,cr,ci; + + for(i=0;i<3;i++){ + c->c[i].real = 0.0; + c->c[i].imag = 0.0; + } + for(n=0;n<4;n++){ + at = a+n; + switch(n){ + case(0): b=b0; break; + case(1): b=b1; break; + case(2): b=b2; break; + case(3): b=b3; break; + } + for(i=0;i<3;i++){ + + ar=at->e[i][0].real; ai=at->e[i][0].imag; + br=b->c[0].real; bi=b->c[0].imag; + cr=ar*br; t=ai*bi; cr -= t; + ci=ar*bi; t=ai*br; ci += t; + + ar=at->e[i][1].real; ai=at->e[i][1].imag; + br=b->c[1].real; bi=b->c[1].imag; + t=ar*br; cr += t; t=ai*bi; cr -= t; + t=ar*bi; ci += t; t=ai*br; ci += t; + + ar=at->e[i][2].real; ai=at->e[i][2].imag; + br=b->c[2].real; bi=b->c[2].imag; + t=ar*br; cr += t; t=ai*bi; cr -= t; + t=ar*bi; ci += t; t=ai*br; ci += t; + + c->c[i].real += cr; + c->c[i].imag += ci; + } + } +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* End of "#ifdef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/macros.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/macros.h new file mode 100644 index 0000000000000000000000000000000000000000..47b314f61715b58fc469f79bc48720f5117d0094 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/macros.h @@ -0,0 +1,49 @@ +/* macros for "field offset" and "field pointer", used when fields + are arguments to subroutines */ +/* Usage: fo = F_OFFSET( field ), where "field" is the name of a field + in lattice. + address = F_PT( &site , fo ), where &site is the address of the + site and fo is a field_offset. Usually, the result will have to be + cast to a pointer to the appropriate type. (It is naturally a char *). +*/ +typedef int field_offset; +#define F_OFFSET(a) \ + ((field_offset)(((char *)&(lattice[0]. a ))-((char *)&(lattice[0])) )) +#define F_PT( site , fo ) ((char *)( site ) + (fo)) + +/* macros to loop over sites of a given parity. + Usage: + int i; + site *s; + FOREVENSITES(i,s){ + commands, where s is a pointer to the current site and i is + the index of the site on the node + } +*/ +#ifdef EVENFIRST +#define FOREVENSITES(i,s) \ + for(i=0,s=lattice;iparity==EVEN) +#define FORODDSITES(i,s) \ + for(i=0,s=lattice;iparity==ODD) +#define FORSOMEPARITY(i,s,choice) \ + for(i=0,s=lattice;iparity & (choice)) != 0) +#endif /* end ifdef EVENFIRST */ +#define FORALLSITES(i,s) \ + for(i=0,s=lattice;ie[0][0].imag + m3->e[1][1].imag + m3->e[2][2].imag)*0.33333333; + ah3->m00im = m3->e[0][0].imag - temp; + ah3->m11im = m3->e[1][1].imag - temp; + ah3->m22im = m3->e[2][2].imag - temp; + ah3->m01.real = (m3->e[0][1].real - m3->e[1][0].real)*0.5; + ah3->m02.real = (m3->e[0][2].real - m3->e[2][0].real)*0.5; + ah3->m12.real = (m3->e[1][2].real - m3->e[2][1].real)*0.5; + ah3->m01.imag = (m3->e[0][1].imag + m3->e[1][0].imag)*0.5; + ah3->m02.imag = (m3->e[0][2].imag + m3->e[2][0].imag)*0.5; + ah3->m12.imag = (m3->e[1][2].imag + m3->e[2][1].imag)*0.5; + +}/* make_anti_hermitian */ + +#else +void make_anti_hermitian( su3_matrix *m3, anti_hermitmat *ah3 ) { +radix temp,temp2; + + temp = + (m3->e[0][0].imag + m3->e[1][1].imag); + temp2 = temp + m3->e[2][2].imag; + temp = temp2*0.33333333; + ah3->m00im = m3->e[0][0].imag - temp; + ah3->m11im = m3->e[1][1].imag - temp; + ah3->m22im = m3->e[2][2].imag - temp; + temp = m3->e[0][1].real - m3->e[1][0].real; ah3->m01.real = temp*0.5; + temp = m3->e[0][2].real - m3->e[2][0].real; ah3->m02.real = temp*0.5; + temp = m3->e[1][2].real - m3->e[2][1].real; ah3->m12.real = temp*0.5; + temp = m3->e[0][1].imag + m3->e[1][0].imag; ah3->m01.imag = temp*0.5; + temp = m3->e[0][2].imag + m3->e[2][0].imag; ah3->m02.imag = temp*0.5; + temp = m3->e[1][2].imag + m3->e[2][1].imag; ah3->m12.imag = temp*0.5; + +}/* make_anti_hermitian */ +#endif /*end ifdef FAST */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/mb_gamma.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/mb_gamma.c new file mode 100644 index 0000000000000000000000000000000000000000..061d3d82836c306edc08ff5103fbd192feaf548d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/mb_gamma.c @@ -0,0 +1,104 @@ +/************* mb_gamma.c (in su3.a) **************************/ +/* + Multiply a Wilson vector by a gamma matrix + usage: mult_by_gamma( wilson_vector *src, wilson_vector *dest, int dir ) + dir = XUP, YUP, ZUP, TUP or GAMMAFIVE + + gamma(XUP) + 0 0 0 i + 0 0 i 0 + 0 -i 0 0 + -i 0 0 0 + + gamma(YUP) + 0 0 0 -1 + 0 0 1 0 + 0 1 0 0 + -1 0 0 0 + + gamma(ZUP) + 0 0 i 0 + 0 0 0 -i + -i 0 0 0 + 0 i 0 0 + + gamma(TUP) + 0 0 1 0 + 0 0 0 1 + 1 0 0 0 + 0 1 0 0 + + gamma(FIVE) + 1 0 0 0 + 0 1 0 0 + 0 0 -1 0 + 0 0 0 -1 +*/ +#include +#include "complex.h" +#include "su3.h" +/* Directions, and a macro to give the opposite direction */ +/* These must go from 0 to 7 because they will be used to index an + array. */ +/* Also define NDIRS = number of directions */ +#define XUP 0 +#define YUP 1 +#define ZUP 2 +#define TUP 3 +#define TDOWN 4 +#define ZDOWN 5 +#define YDOWN 6 +#define XDOWN 7 + +#define OPP_DIR(dir) (7-(dir)) /* Opposite direction */ +#define NDIRS 8 /* number of directions */ + +void mult_by_gamma( wilson_vector *src, wilson_vector *dest, int dir ){ + register int i; /*color*/ + + switch(dir){ + case XUP: + for(i=0;i<3;i++){ + TIMESPLUSI( src->d[3].c[i], dest->d[0].c[i] ); + TIMESPLUSI( src->d[2].c[i], dest->d[1].c[i] ); + TIMESMINUSI( src->d[1].c[i], dest->d[2].c[i] ); + TIMESMINUSI( src->d[0].c[i], dest->d[3].c[i] ); + } + break; + case YUP: + for(i=0;i<3;i++){ + TIMESMINUSONE( src->d[3].c[i], dest->d[0].c[i] ); + TIMESPLUSONE( src->d[2].c[i], dest->d[1].c[i] ); + TIMESPLUSONE( src->d[1].c[i], dest->d[2].c[i] ); + TIMESMINUSONE( src->d[0].c[i], dest->d[3].c[i] ); + } + break; + case ZUP: + for(i=0;i<3;i++){ + TIMESPLUSI( src->d[2].c[i], dest->d[0].c[i] ); + TIMESMINUSI( src->d[3].c[i], dest->d[1].c[i] ); + TIMESMINUSI( src->d[0].c[i], dest->d[2].c[i] ); + TIMESPLUSI( src->d[1].c[i], dest->d[3].c[i] ); + } + break; + case TUP: + for(i=0;i<3;i++){ + TIMESPLUSONE( src->d[2].c[i], dest->d[0].c[i] ); + TIMESPLUSONE( src->d[3].c[i], dest->d[1].c[i] ); + TIMESPLUSONE( src->d[0].c[i], dest->d[2].c[i] ); + TIMESPLUSONE( src->d[1].c[i], dest->d[3].c[i] ); + } + break; + case GAMMAFIVE: + for(i=0;i<3;i++){ + TIMESPLUSONE( src->d[0].c[i], dest->d[0].c[i] ); + TIMESPLUSONE( src->d[1].c[i], dest->d[1].c[i] ); + TIMESMINUSONE( src->d[2].c[i], dest->d[2].c[i] ); + TIMESMINUSONE( src->d[3].c[i], dest->d[3].c[i] ); + } + break; + default: + printf("BAD CALL TO MULT_BY_GAMMA()\n"); + } +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/mb_gamma_l.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/mb_gamma_l.c new file mode 100644 index 0000000000000000000000000000000000000000..e049e757e4f244cb35082b771f1476ca3b17f1f5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/mb_gamma_l.c @@ -0,0 +1,126 @@ +/************* mb_gamma_l.c (in su3.a) **************************/ +/* + Multiply a Wilson matrix by a gamma matrix acting on the row index + (This is the first index, or equivalently, multiplication on the left) + usage: mult_by_gamma_left( wilson_matrix *src, wilson_matrix *dest, int dir ) + dir = XUP, YUP, ZUP, TUP or GAMMAFIVE + + gamma(XUP) + 0 0 0 i + 0 0 i 0 + 0 -i 0 0 + -i 0 0 0 + + gamma(YUP) + 0 0 0 -1 + 0 0 1 0 + 0 1 0 0 + -1 0 0 0 + + gamma(ZUP) + 0 0 i 0 + 0 0 0 -i + -i 0 0 0 + 0 i 0 0 + + gamma(TUP) + 0 0 1 0 + 0 0 0 1 + 1 0 0 0 + 0 1 0 0 + + gamma(FIVE) + 1 0 0 0 + 0 1 0 0 + 0 0 -1 0 + 0 0 0 -1 +*/ +#include +#include "complex.h" +#include "su3.h" +/* Directions, and a macro to give the opposite direction */ +/* These must go from 0 to 7 because they will be used to index an + array. */ +/* Also define NDIRS = number of directions */ +#define XUP 0 +#define YUP 1 +#define ZUP 2 +#define TUP 3 +#define TDOWN 4 +#define ZDOWN 5 +#define YDOWN 6 +#define XDOWN 7 + +#define OPP_DIR(dir) (7-(dir)) /* Opposite direction */ +#define NDIRS 8 /* number of directions */ + +void mult_by_gamma_left( wilson_matrix *src, wilson_matrix *dest, int dir ){ + register int i; /*color*/ + register int c2,s2; /* column indices, color and spin */ + + switch(dir){ + case XUP: + for(i=0;i<3;i++)for(s2=0;s2<4;s2++)for(c2=0;c2<3;c2++){ + TIMESPLUSI( src->d[3].c[i].d[s2].c[c2], + dest->d[0].c[i].d[s2].c[c2] ); + TIMESPLUSI( src->d[2].c[i].d[s2].c[c2], + dest->d[1].c[i].d[s2].c[c2] ); + TIMESMINUSI( src->d[1].c[i].d[s2].c[c2], + dest->d[2].c[i].d[s2].c[c2] ); + TIMESMINUSI( src->d[0].c[i].d[s2].c[c2], + dest->d[3].c[i].d[s2].c[c2] ); + } + break; + case YUP: + for(i=0;i<3;i++)for(s2=0;s2<4;s2++)for(c2=0;c2<3;c2++){ + TIMESMINUSONE( src->d[3].c[i].d[s2].c[c2], + dest->d[0].c[i].d[s2].c[c2] ); + TIMESPLUSONE( src->d[2].c[i].d[s2].c[c2], + dest->d[1].c[i].d[s2].c[c2] ); + TIMESPLUSONE( src->d[1].c[i].d[s2].c[c2], + dest->d[2].c[i].d[s2].c[c2] ); + TIMESMINUSONE( src->d[0].c[i].d[s2].c[c2], + dest->d[3].c[i].d[s2].c[c2] ); + } + break; + case ZUP: + for(i=0;i<3;i++)for(s2=0;s2<4;s2++)for(c2=0;c2<3;c2++){ + TIMESPLUSI( src->d[2].c[i].d[s2].c[c2], + dest->d[0].c[i].d[s2].c[c2] ); + TIMESMINUSI( src->d[3].c[i].d[s2].c[c2], + dest->d[1].c[i].d[s2].c[c2] ); + TIMESMINUSI( src->d[0].c[i].d[s2].c[c2], + dest->d[2].c[i].d[s2].c[c2] ); + TIMESPLUSI( src->d[1].c[i].d[s2].c[c2], + dest->d[3].c[i].d[s2].c[c2] ); + } + break; + case TUP: + for(i=0;i<3;i++)for(s2=0;s2<4;s2++)for(c2=0;c2<3;c2++){ + TIMESPLUSONE( src->d[2].c[i].d[s2].c[c2], + dest->d[0].c[i].d[s2].c[c2] ); + TIMESPLUSONE( src->d[3].c[i].d[s2].c[c2], + dest->d[1].c[i].d[s2].c[c2] ); + TIMESPLUSONE( src->d[0].c[i].d[s2].c[c2], + dest->d[2].c[i].d[s2].c[c2] ); + TIMESPLUSONE( src->d[1].c[i].d[s2].c[c2], + dest->d[3].c[i].d[s2].c[c2] ); + } + break; + case GAMMAFIVE: + for(i=0;i<3;i++)for(s2=0;s2<4;s2++)for(c2=0;c2<3;c2++){ + TIMESPLUSONE( src->d[0].c[i].d[s2].c[c2], + dest->d[0].c[i].d[s2].c[c2] ); + TIMESPLUSONE( src->d[1].c[i].d[s2].c[c2], + dest->d[1].c[i].d[s2].c[c2] ); + TIMESMINUSONE( src->d[2].c[i].d[s2].c[c2], + dest->d[2].c[i].d[s2].c[c2] ); + TIMESMINUSONE( src->d[3].c[i].d[s2].c[c2], + dest->d[3].c[i].d[s2].c[c2] ); + } + break; + default: + printf("BAD CALL TO MULT_BY_GAMMA_LEFT()\n"); + } +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/mb_gamma_r.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/mb_gamma_r.c new file mode 100644 index 0000000000000000000000000000000000000000..28a286659a5425eab5f695b4a12df72988183fb0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/mb_gamma_r.c @@ -0,0 +1,127 @@ +/************* mb_gamma_r.c (in su3.a) **************************/ +/* + Multiply a Wilson matrix by a gamma matrix acting on the column index + (This is the second index, or equivalently, multiplication on the right) + usage: mult_by_gamma_right wilson_matrix *src, wilson_matrix *dest, + int dir ) + dir = XUP, YUP, ZUP, TUP or GAMMAFIVE + + gamma(XUP) + 0 0 0 i + 0 0 i 0 + 0 -i 0 0 + -i 0 0 0 + + gamma(YUP) + 0 0 0 -1 + 0 0 1 0 + 0 1 0 0 + -1 0 0 0 + + gamma(ZUP) + 0 0 i 0 + 0 0 0 -i + -i 0 0 0 + 0 i 0 0 + + gamma(TUP) + 0 0 1 0 + 0 0 0 1 + 1 0 0 0 + 0 1 0 0 + + gamma(FIVE) + 1 0 0 0 + 0 1 0 0 + 0 0 -1 0 + 0 0 0 -1 +*/ +#include +#include "complex.h" +#include "su3.h" +/* Directions, and a macro to give the opposite direction */ +/* These must go from 0 to 7 because they will be used to index an + array. */ +/* Also define NDIRS = number of directions */ +#define XUP 0 +#define YUP 1 +#define ZUP 2 +#define TUP 3 +#define TDOWN 4 +#define ZDOWN 5 +#define YDOWN 6 +#define XDOWN 7 + +#define OPP_DIR(dir) (7-(dir)) /* Opposite direction */ +#define NDIRS 8 /* number of directions */ + +void mult_by_gamma_right( wilson_matrix *src, wilson_matrix *dest, int dir ){ + register int i; /*color*/ + register int c1,s1; /* row indices, color and spin */ + + switch(dir){ + case XUP: + for(i=0;i<3;i++)for(s1=0;s1<4;s1++)for(c1=0;c1<3;c1++){ + TIMESMINUSI( src->d[s1].c[c1].d[3].c[i], + dest->d[s1].c[c1].d[0].c[i] ); + TIMESMINUSI( src->d[s1].c[c1].d[2].c[i], + dest->d[s1].c[c1].d[1].c[i] ); + TIMESPLUSI( src->d[s1].c[c1].d[1].c[i], + dest->d[s1].c[c1].d[2].c[i] ); + TIMESPLUSI( src->d[s1].c[c1].d[0].c[i], + dest->d[s1].c[c1].d[3].c[i] ); + } + break; + case YUP: + for(i=0;i<3;i++)for(s1=0;s1<4;s1++)for(c1=0;c1<3;c1++){ + TIMESMINUSONE( src->d[s1].c[c1].d[3].c[i], + dest->d[s1].c[c1].d[0].c[i] ); + TIMESPLUSONE( src->d[s1].c[c1].d[2].c[i], + dest->d[s1].c[c1].d[1].c[i] ); + TIMESPLUSONE( src->d[s1].c[c1].d[1].c[i], + dest->d[s1].c[c1].d[2].c[i] ); + TIMESMINUSONE( src->d[s1].c[c1].d[0].c[i], + dest->d[s1].c[c1].d[3].c[i] ); + } + break; + case ZUP: + for(i=0;i<3;i++)for(s1=0;s1<4;s1++)for(c1=0;c1<3;c1++){ + TIMESMINUSI( src->d[s1].c[c1].d[2].c[i], + dest->d[s1].c[c1].d[0].c[i] ); + TIMESPLUSI( src->d[s1].c[c1].d[3].c[i], + dest->d[s1].c[c1].d[1].c[i] ); + TIMESPLUSI( src->d[s1].c[c1].d[0].c[i], + dest->d[s1].c[c1].d[2].c[i] ); + TIMESMINUSI( src->d[s1].c[c1].d[1].c[i], + dest->d[s1].c[c1].d[3].c[i] ); + } + break; + case TUP: + for(i=0;i<3;i++)for(s1=0;s1<4;s1++)for(c1=0;c1<3;c1++){ + TIMESPLUSONE( src->d[s1].c[c1].d[2].c[i], + dest->d[s1].c[c1].d[0].c[i] ); + TIMESPLUSONE( src->d[s1].c[c1].d[3].c[i], + dest->d[s1].c[c1].d[1].c[i] ); + TIMESPLUSONE( src->d[s1].c[c1].d[0].c[i], + dest->d[s1].c[c1].d[2].c[i] ); + TIMESPLUSONE( src->d[s1].c[c1].d[1].c[i], + dest->d[s1].c[c1].d[3].c[i] ); + } + break; + case GAMMAFIVE: + for(i=0;i<3;i++)for(s1=0;s1<4;s1++)for(c1=0;c1<3;c1++){ + TIMESPLUSONE( src->d[s1].c[c1].d[0].c[i], + dest->d[s1].c[c1].d[0].c[i] ); + TIMESPLUSONE( src->d[s1].c[c1].d[1].c[i], + dest->d[s1].c[c1].d[1].c[i] ); + TIMESMINUSONE( src->d[s1].c[c1].d[2].c[i], + dest->d[s1].c[c1].d[2].c[i] ); + TIMESMINUSONE( src->d[s1].c[c1].d[3].c[i], + dest->d[s1].c[c1].d[3].c[i] ); + } + break; + default: + printf("BAD CALL TO MULT_BY_GAMMA_RIGHT()\n"); + } +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/msq_su3vec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/msq_su3vec.c new file mode 100644 index 0000000000000000000000000000000000000000..011826844e790aaf053a92a16a7e0f949fcee0a5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/msq_su3vec.c @@ -0,0 +1,48 @@ +/****************** magsq_su3vec.c (in su3.a) ****************************** +* * +* radix magsq_su3vec( su3_vector *a ) * +* return squared magnitude of an SU3 vector +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +radix magsq_su3vec( su3_vector *a ){ +register radix sum; +register int i; + for(i=0,sum=0.0;i<3;i++)sum += a->c[i].real*a->c[i].real + + a->c[i].imag*a->c[i].imag; + return(sum); +} + +#else +#ifdef NATIVEDOUBLE /* IBM RS6000 version */ +radix magsq_su3vec(su3_vector *a){ + + register double ar,ai,sum; + + ar=a->c[0].real; ai=a->c[0].imag; + sum = ar*ar + ai*ai; + + ar=a->c[1].real; ai=a->c[1].imag; + sum += ar*ar + ai*ai; + + ar=a->c[2].real; ai=a->c[2].imag; + sum += ar*ar + ai*ai; + + return((radix)sum); +} +#else +radix magsq_su3vec( su3_vector *a ){ +register radix temp,sum; + sum=0.0; + temp = a->c[0].real*a->c[0].real; sum += temp; + temp = a->c[0].imag*a->c[0].imag; sum += temp; + temp = a->c[1].real*a->c[1].real; sum += temp; + temp = a->c[1].imag*a->c[1].imag; sum += temp; + temp = a->c[2].real*a->c[2].real; sum += temp; + temp = a->c[2].imag*a->c[2].imag; sum += temp; + return(sum); +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* end ifdef FAST */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/msq_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/msq_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..05e1910dffdb7c220c7f0c93ac74d85f026dd169 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/msq_wvec.c @@ -0,0 +1,57 @@ + /******************** msq_wvec.c (in su3.a) ******************** +* +*radix msq_wvec(wilson_vector *vec) +* squared magnitude of a Wilson vector +* +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +radix magsq_wvec( wilson_vector *vec ){ + register int i; + register radix sum; + sum=0.0; + for(i=0;i<4;i++)sum += magsq_su3vec( &(vec->d[i]) ); + return(sum); + +#else /* Fast version */ +radix magsq_wvec( wilson_vector *vec ){ + +#ifdef NATIVEDOUBLE + register double ar,ai,sum; +#else + register radix ar,ai,sum; +#endif + + ar=vec->d[0].c[0].real; ai=vec->d[0].c[0].imag; + sum = ar*ar + ai*ai; + ar=vec->d[0].c[1].real; ai=vec->d[0].c[1].imag; + sum += ar*ar + ai*ai; + ar=vec->d[0].c[2].real; ai=vec->d[0].c[2].imag; + sum += ar*ar + ai*ai; + + ar=vec->d[1].c[0].real; ai=vec->d[1].c[0].imag; + sum += ar*ar + ai*ai; + ar=vec->d[1].c[1].real; ai=vec->d[1].c[1].imag; + sum += ar*ar + ai*ai; + ar=vec->d[1].c[2].real; ai=vec->d[1].c[2].imag; + sum += ar*ar + ai*ai; + + ar=vec->d[2].c[0].real; ai=vec->d[2].c[0].imag; + sum += ar*ar + ai*ai; + ar=vec->d[2].c[1].real; ai=vec->d[2].c[1].imag; + sum += ar*ar + ai*ai; + ar=vec->d[2].c[2].real; ai=vec->d[2].c[2].imag; + sum += ar*ar + ai*ai; + + ar=vec->d[3].c[0].real; ai=vec->d[3].c[0].imag; + sum += ar*ar + ai*ai; + ar=vec->d[3].c[1].real; ai=vec->d[3].c[1].imag; + sum += ar*ar + ai*ai; + ar=vec->d[3].c[2].real; ai=vec->d[3].c[2].imag; + sum += ar*ar + ai*ai; + + return((radix)sum); +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/pvm3.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/pvm3.h new file mode 100644 index 0000000000000000000000000000000000000000..f43a3c4524170484adeb39e54686a8e0c299f7b1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/pvm3.h @@ -0,0 +1,203 @@ + +/* + * PVM 3.0: Parallel Virtual Machine System 3.0 + * University of Tennessee, Knoxville TN. + * Oak Ridge National Laboratory, Oak Ridge TN. + * Emory University, Atlanta GA. + * Authors: A. L. Beguelin, J. J. Dongarra, G. A. Geist, + * R. J. Manchek, B. K. Moore, and V. S. Sunderam + * (C) 1992 All Rights Reserved + * + * NOTICE + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby granted + * provided that the above copyright notice appear in all copies and + * that both the copyright notice and this permission notice appear in + * supporting documentation. + * + * Neither the Institutions (Emory University, Oak Ridge National + * Laboratory, and University of Tennessee) nor the Authors make any + * representations about the suitability of this software for any + * purpose. This software is provided ``as is'' without express or + * implied warranty. + * + * PVM 3.0 was funded in part by the U.S. Department of Energy, the + * National Science Foundation and the State of Tennessee. + */ + +#ifndef _PVM3_H_ + +#define _PVM3_H_ + +/* +* Data packing styles for pvm_initsend() +*/ + +#define PvmDataDefault 0 +#define PvmDataRaw 1 +#define PvmDataInPlace 2 +#define PvmDataFoo 3 + +/* +* pvm_spawn options +*/ + +#define PvmTaskDefault 0 +#define PvmTaskHost 1 /* specify host */ +#define PvmTaskArch 2 /* specify architecture */ +#define PvmTaskDebug 4 /* start task in debugger */ + +/* +* pvm_notify types +*/ + +#define PvmTaskExit 1 /* on task exit */ +#define PvmHostDelete 2 /* on host fail/delete */ +#define PvmHostAdd 3 /* on host startup */ + +/* +* Libpvm error codes +*/ + +#define PvmOk 0 /* okay */ + /* reserve -1 */ +#define PvmBadParam -2 /* bad parameter (neg msg id, etc) */ +#define PvmMismatch -3 /* barrier count mismatch */ +#define PvmNoData -5 /* read past end of buffer */ +#define PvmNoHost -6 /* no such host */ +#define PvmNoFile -7 /* no such executable */ +#define PvmNoMem -10 /* can't get memory */ +#define PvmBadMsg -12 /* received msg can't be decoded */ +#define PvmSysErr -14 /* can't contact our pvmd/some system error */ +#define PvmNoBuf -15 /* no current buffer */ +#define PvmNoSuchBuf -16 /* bad message id */ +#define PvmNullGroup -17 /* null group name is illegal */ +#define PvmDupGroup -18 /* already in group */ +#define PvmNoGroup -19 /* no group with name */ +#define PvmNotInGroup -20 /* task not in group */ +#define PvmNoInst -21 /* no such instance in group */ +#define PvmHostFail -22 /* host failed */ +#define PvmNoParent -23 /* no parent task */ +#define PvmNotImpl -24 /* function not implemented */ +#define PvmDSysErr -25 /* pvmd system error */ +#define PvmBadVersion -26 /* pvmd-pvmd protocol version mismatch */ +#define PvmOutOfRes -27 /* out of resources */ +#define PvmDupHost -28 /* host already configured */ +#define PvmCantStart -29 /* failed to exec new slave pvmd */ +#define PvmAlready -30 /* already doing operation */ +#define PvmNoTask -31 /* no such task */ +#define PvmNoEntry -32 /* no such name, index pair */ +#define PvmDupEntry -33 /* name, index pair already exists */ + +/* +* returned by pvm_config() +*/ + +struct hostinfo { + int hi_tid; /* pvmd tid */ + char *hi_name; /* host name */ + char *hi_arch; /* host arch */ + int hi_mtu; /* max packet length */ + int hi_speed; /* cpu relative speed */ +}; + +/* +* returned by pvm_tasks() +*/ + +struct taskinfo { + int ti_tid; /* task id */ + int ti_ptid; /* parent tid */ + int ti_host; /* pvmd tid */ + int ti_flag; /* status flags */ + char *ti_a_out; /* a.out name */ +}; + + +#ifdef __ProtoGlarp__ +#undef __ProtoGlarp__ +#endif +#ifdef __STDC__ +#define __ProtoGlarp__(x) x +#else +#define __ProtoGlarp__(x) () +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +int pvm_addhosts __ProtoGlarp__(( char **names, int count, int *svp )); +int pvm_barrier __ProtoGlarp__(( char *group, int count )); +int pvm_bcast __ProtoGlarp__(( char *group, int code )); +int pvm_bufinfo __ProtoGlarp__(( int mid, int *len, int *code, int *tid )); +int pvm_config __ProtoGlarp__(( int *nhostp, int *narchp, + struct hostinfo **hostp )); +int pvm_delete __ProtoGlarp__(( char *name, int req )); +int pvm_delhosts __ProtoGlarp__(( char **names, int count, int *svp )); +int pvm_exit __ProtoGlarp__(( void )); +int pvm_freebuf __ProtoGlarp__(( int mid )); +int pvm_getinst __ProtoGlarp__(( char *group, int tid )); +int pvm_getrbuf __ProtoGlarp__(( void )); +int pvm_getsbuf __ProtoGlarp__(( void )); +int pvm_gettid __ProtoGlarp__(( char *group, int inst )); +int pvm_gsize __ProtoGlarp__(( char *group )); +int pvm_halt __ProtoGlarp__(( void )); +int pvm_initsend __ProtoGlarp__(( int encod )); +int pvm_insert __ProtoGlarp__(( char *name, int req, int data )); +int pvm_joingroup __ProtoGlarp__(( char *group )); +int pvm_kill __ProtoGlarp__(( int tid )); +int pvm_lookup __ProtoGlarp__(( char *name, int req, int *datap )); +int pvm_lvgroup __ProtoGlarp__(( char *group )); +int pvm_mcast __ProtoGlarp__(( int *tids, int count, int code )); +int pvm_mkbuf __ProtoGlarp__(( int encod )); +int pvm_mstat __ProtoGlarp__(( char *host )); +int pvm_mytid __ProtoGlarp__(( void )); +int pvm_notify __ProtoGlarp__(( int what, int code, + int count, int *tids )); +int pvm_nrecv __ProtoGlarp__(( int tid, int code )); +int pvm_parent __ProtoGlarp__(( void )); +int pvm_perror __ProtoGlarp__(( char *msg )); +int pvm_pkbyte __ProtoGlarp__(( char *cp, int cnt, int std )); +int pvm_pkcplx __ProtoGlarp__(( radix *xp, int cnt, int std )); +int pvm_pkdcplx __ProtoGlarp__(( double *zp, int cnt, int std )); +int pvm_pkdouble __ProtoGlarp__(( double *dp, int cnt, int std )); +int pvm_pkradix __ProtoGlarp__(( radix *fp, int cnt, int std )); +int pvm_pkint __ProtoGlarp__(( int *np, int cnt, int std )); +int pvm_pklong __ProtoGlarp__(( long *np, int cnt, int std )); +int pvm_pkshort __ProtoGlarp__(( short *np, int cnt, int std )); +int pvm_pkstr __ProtoGlarp__(( char *cp )); +int pvm_pstat __ProtoGlarp__(( int tid )); +int pvm_recv __ProtoGlarp__(( int tid, int code )); +int (*pvm_recvf __ProtoGlarp__(( int (*new)() )) )(); +int pvm_send __ProtoGlarp__(( int tid, int code )); +int pvm_sendsig __ProtoGlarp__(( int tid, int signum )); +int pvm_serror __ProtoGlarp__(( int how )); +int pvm_setdebug __ProtoGlarp__(( int mask )); +int pvm_setrbuf __ProtoGlarp__(( int mid )); +int pvm_setsbuf __ProtoGlarp__(( int mid )); +int pvm_spawn __ProtoGlarp__(( char *file, char **argv, int flags, + char *where, int count, int *tids )); +int pvm_start_pvmd __ProtoGlarp__(( int argc, char **argv )); +int pvm_tasks __ProtoGlarp__(( int where, int *ntaskp, + struct taskinfo **taskp )); +int pvm_tickle __ProtoGlarp__(( int how )); +int pvm_tidtohost __ProtoGlarp__(( int tid )); +int pvm_upkbyte __ProtoGlarp__(( char *cp, int cnt, int std )); +int pvm_upkcplx __ProtoGlarp__(( radix *xp, int cnt, int std )); +int pvm_upkdcplx __ProtoGlarp__(( double *zp, int cnt, int std )); +int pvm_upkdouble __ProtoGlarp__(( double *dp, int cnt, int std )); +int pvm_upkradix __ProtoGlarp__(( radix *fp, int cnt, int std )); +int pvm_upkint __ProtoGlarp__(( int *np, int cnt, int std )); +int pvm_upklong __ProtoGlarp__(( long *np, int cnt, int std )); +int pvm_upkshort __ProtoGlarp__(( short *np, int cnt, int std )); +int pvm_upkstr __ProtoGlarp__(( char *cp )); +char *pvm_version __ProtoGlarp__(( void )); + +#ifdef __cplusplus +} +#endif + +#endif /*_PVM3_H_*/ + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/radix.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/radix.h new file mode 100644 index 0000000000000000000000000000000000000000..ac192dd3ca472af904227a2049dcc22aa0f4af98 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/radix.h @@ -0,0 +1,12 @@ +#ifndef RADIX +#define RADIX +/* this file just defines radix */ + +#define RADIX_F /* define symbol so that know radix is in use + * and is float + * another option: RADIX_D + */ + +typedef float radix; /* basic type */ + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/rand_ahmat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/rand_ahmat.c new file mode 100644 index 0000000000000000000000000000000000000000..06aec2236a6553386803b2dfd9dca539ca52a0d4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/rand_ahmat.c @@ -0,0 +1,31 @@ +/****************** rand_ahmat.c (in su3.a) *************************** +* * +* void random_anti_hermitian( anti_hermitmat *mat_antihermit, passthru *prn_pt)* +* Creates gaussian random anti-hermitian matrices * +* Normalization is < |m01|^2 > = 1, or < m01.real*m01.real > = 1/2 * +* The argument "prn_pt" is a pointer to be passed to gaussian_rand_no() * +* RS6000 may choke on void * * +*/ +typedef void passthru; +#include +#include "complex.h" +#include "su3.h" + +void random_anti_hermitian( anti_hermitmat *mat_antihermit, passthru *prn_pt) { +radix r3,r8; +radix sqrt_third; + + sqrt_third = sqrt( (double)(1.0/3.0) ); + r3=gaussian_rand_no(prn_pt); + r8=gaussian_rand_no(prn_pt); + mat_antihermit->m00im=r3+sqrt_third*r8; + mat_antihermit->m11im= -r3+sqrt_third*r8; + mat_antihermit->m22im= -2.0*sqrt_third*r8; + mat_antihermit->m01.real=gaussian_rand_no(prn_pt); + mat_antihermit->m02.real=gaussian_rand_no(prn_pt); + mat_antihermit->m12.real=gaussian_rand_no(prn_pt); + mat_antihermit->m01.imag=gaussian_rand_no(prn_pt); + mat_antihermit->m02.imag=gaussian_rand_no(prn_pt); + mat_antihermit->m12.imag=gaussian_rand_no(prn_pt); + +}/*random_anti_hermitian_*/ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/realtr.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/realtr.c new file mode 100644 index 0000000000000000000000000000000000000000..24c6f5289ada8c5df4697f1f06b32d295bd462f7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/realtr.c @@ -0,0 +1,15 @@ +/****************** realtr.c (in su3.a) ******************************* +* * +* radix realtrace_su3( su3_matrix *a,*b) * +* return Re( Tr( A_adjoint*B ) * +*/ +#include "complex.h" +#include "su3.h" + +radix realtrace_su3( su3_matrix *a, su3_matrix *b ){ +register int i,j; +register radix sum; + for(sum=0.0,i=0;i<3;i++)for(j=0;j<3;j++) + sum+= a->e[i][j].real*b->e[i][j].real + a->e[i][j].imag*b->e[i][j].imag; + return(sum); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_a_mat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_a_mat.c new file mode 100644 index 0000000000000000000000000000000000000000..cc2cabdb6a1a8a2423d56f95e65b4270dac3e4d5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_a_mat.c @@ -0,0 +1,49 @@ +/**************** s_m_a_mat.c (in su3.a) ****************************** +* * +* void scalar_mult_add_su3_matrix( su3_matrix *a, su3_matrix *b, * +* radix s, su3_matrix *c) * +* C <- A + s*B * +*/ +#include "complex.h" +#include "su3.h" + +/* c <- a + s*b, matrices */ +void scalar_mult_add_su3_matrix(su3_matrix *a,su3_matrix *b,radix s, + su3_matrix *c){ + +#ifndef NATIVEDOUBLE +register int i,j; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + c->e[i][j].real = a->e[i][j].real + s*b->e[i][j].real; + c->e[i][j].imag = a->e[i][j].imag + s*b->e[i][j].imag; + } + +#else /* RS6000 version */ + + register double ss; + + ss = s; + + c->e[0][0].real = a->e[0][0].real + ss*b->e[0][0].real; + c->e[0][0].imag = a->e[0][0].imag + ss*b->e[0][0].imag; + c->e[0][1].real = a->e[0][1].real + ss*b->e[0][1].real; + c->e[0][1].imag = a->e[0][1].imag + ss*b->e[0][1].imag; + c->e[0][2].real = a->e[0][2].real + ss*b->e[0][2].real; + c->e[0][2].imag = a->e[0][2].imag + ss*b->e[0][2].imag; + + c->e[1][0].real = a->e[1][0].real + ss*b->e[1][0].real; + c->e[1][0].imag = a->e[1][0].imag + ss*b->e[1][0].imag; + c->e[1][1].real = a->e[1][1].real + ss*b->e[1][1].real; + c->e[1][1].imag = a->e[1][1].imag + ss*b->e[1][1].imag; + c->e[1][2].real = a->e[1][2].real + ss*b->e[1][2].real; + c->e[1][2].imag = a->e[1][2].imag + ss*b->e[1][2].imag; + + c->e[2][0].real = a->e[2][0].real + ss*b->e[2][0].real; + c->e[2][0].imag = a->e[2][0].imag + ss*b->e[2][0].imag; + c->e[2][1].real = a->e[2][1].real + ss*b->e[2][1].real; + c->e[2][1].imag = a->e[2][1].imag + ss*b->e[2][1].imag; + c->e[2][2].real = a->e[2][2].real + ss*b->e[2][2].real; + c->e[2][2].imag = a->e[2][2].imag + ss*b->e[2][2].imag; + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_a_vec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_a_vec.c new file mode 100644 index 0000000000000000000000000000000000000000..2a5090ffb2e3728b020749114789a1da463f56a2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_a_vec.c @@ -0,0 +1,36 @@ +/**************** s_m_a_vec.c (in su3.a) ****************************** +* * +* void scalar_mult_add_su3_vector( su3_vector *a, su3_vector *b, * +* radix s, su3_vector *c) * +* C <- A + s*B, A,B and C vectors * +*/ +#include "complex.h" +#include "su3.h" + +/* c <- a + s*b, vectors */ + +void scalar_mult_add_su3_vector(su3_vector *a, su3_vector *b, radix s, + su3_vector *c){ + +#ifndef NATIVEDOUBLE + register int i; + for(i=0;i<3;i++){ + c->c[i].real = a->c[i].real + s*b->c[i].real; + c->c[i].imag = a->c[i].imag + s*b->c[i].imag; + } + +#else /* RS6000 version */ + + register double ss; + + ss = s; + + c->c[0].real = a->c[0].real + ss*b->c[0].real; + c->c[0].imag = a->c[0].imag + ss*b->c[0].imag; + c->c[1].real = a->c[1].real + ss*b->c[1].real; + c->c[1].imag = a->c[1].imag + ss*b->c[1].imag; + c->c[2].real = a->c[2].real + ss*b->c[2].real; + c->c[2].imag = a->c[2].imag + ss*b->c[2].imag; + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_a_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_a_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..6805599b797832a2333b5d4ac160fe682533fea9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_a_wvec.c @@ -0,0 +1,51 @@ +/******************** vol_s_m_a_wvec.c (in su3.a) ******************** +* +*void scalar_mult_add_wvec(wilson_vector *src1, wilson_vector *src2, + radix s, wilson_vector *dest) +* Multiply a Wilson vector by a scalar and add to another vector +* dest <- src1 + s*src2 +*/ + +#include "lattice.h" + +void scalar_mult_sum_wvec_V(wilson_vector s1[], wilson_vector s2[], radix ss, + wilson_vector d[]) +{ + int i; + wilson_vector *dest; + register wilson_vector src1,src2; + + for_active_sites(i) { + src1 = s1[i]; + src2 = s2[i]; + dest = &d[i] + + dest->d[0].c[0].real = src1.d[0].c[0].real + ss*src2.d[0].c[0].real; + dest->d[0].c[0].imag = src1.d[0].c[0].imag + ss*src2.d[0].c[0].imag; + dest->d[0].c[1].real = src1.d[0].c[1].real + ss*src2.d[0].c[1].real; + dest->d[0].c[1].imag = src1.d[0].c[1].imag + ss*src2.d[0].c[1].imag; + dest->d[0].c[2].real = src1.d[0].c[2].real + ss*src2.d[0].c[2].real; + dest->d[0].c[2].imag = src1.d[0].c[2].imag + ss*src2.d[0].c[2].imag; + + dest->d[1].c[0].real = src1.d[1].c[0].real + ss*src2.d[1].c[0].real; + dest->d[1].c[0].imag = src1.d[1].c[0].imag + ss*src2.d[1].c[0].imag; + dest->d[1].c[1].real = src1.d[1].c[1].real + ss*src2.d[1].c[1].real; + dest->d[1].c[1].imag = src1.d[1].c[1].imag + ss*src2.d[1].c[1].imag; + dest->d[1].c[2].real = src1.d[1].c[2].real + ss*src2.d[1].c[2].real; + dest->d[1].c[2].imag = src1.d[1].c[2].imag + ss*src2.d[1].c[2].imag; + + dest->d[2].c[0].real = src1.d[2].c[0].real + ss*src2.d[2].c[0].real; + dest->d[2].c[0].imag = src1.d[2].c[0].imag + ss*src2.d[2].c[0].imag; + dest->d[2].c[1].real = src1.d[2].c[1].real + ss*src2.d[2].c[1].real; + dest->d[2].c[1].imag = src1.d[2].c[1].imag + ss*src2.d[2].c[1].imag; + dest->d[2].c[2].real = src1.d[2].c[2].real + ss*src2.d[2].c[2].real; + dest->d[2].c[2].imag = src1.d[2].c[2].imag + ss*src2.d[2].c[2].imag; + + dest->d[3].c[0].real = src1.d[3].c[0].real + ss*src2.d[3].c[0].real; + dest->d[3].c[0].imag = src1.d[3].c[0].imag + ss*src2.d[3].c[0].imag; + dest->d[3].c[1].real = src1.d[3].c[1].real + ss*src2.d[3].c[1].real; + dest->d[3].c[1].imag = src1.d[3].c[1].imag + ss*src2.d[3].c[1].imag; + dest->d[3].c[2].real = src1.d[3].c[2].real + ss*src2.d[3].c[2].real; + dest->d[3].c[2].imag = src1.d[3].c[2].imag + ss*src2.d[3].c[2].imag; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_atm_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_atm_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..e18811e5c95e1174b12f4c57ff58c1ae7baf7fb7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_atm_wvec.c @@ -0,0 +1,61 @@ +/***************** s_m_atm_wvec.c (in su3.a) ******************** +* +*void scalar_mult_addtm_wvec(wilson_vector *src1, wilson_vector *src2, + radix s, wilson_vector *dest) +* Multiply a Wilson vector by a scalar and add to minus one times +* another vector +* dest <- (-1)*src1 + s*src2 +*/ +#include "complex.h" +#include "su3.h" + +void scalar_mult_addtm_wvec(wilson_vector *src1,wilson_vector *src2, + radix s,wilson_vector *dest){ + +#ifndef NATIVEDOUBLE + register int i,j; + for(i=0;i<4;i++){ /*spins*/ + for(j=0;j<3;j++){ /*colors*/ + dest->d[i].c[j].real = -src1->d[i].c[j].real + + s*src2->d[i].c[j].real; + dest->d[i].c[j].imag = -src1->d[i].c[j].imag + + s*src2->d[i].c[j].imag; + } + } + +#else /* RS6000 version */ + + register double ss; + ss = s; + + dest->d[0].c[0].real = -src1->d[0].c[0].real + ss*src2->d[0].c[0].real; + dest->d[0].c[0].imag = -src1->d[0].c[0].imag + ss*src2->d[0].c[0].imag; + dest->d[0].c[1].real = -src1->d[0].c[1].real + ss*src2->d[0].c[1].real; + dest->d[0].c[1].imag = -src1->d[0].c[1].imag + ss*src2->d[0].c[1].imag; + dest->d[0].c[2].real = -src1->d[0].c[2].real + ss*src2->d[0].c[2].real; + dest->d[0].c[2].imag = -src1->d[0].c[2].imag + ss*src2->d[0].c[2].imag; + + dest->d[1].c[0].real = -src1->d[1].c[0].real + ss*src2->d[1].c[0].real; + dest->d[1].c[0].imag = -src1->d[1].c[0].imag + ss*src2->d[1].c[0].imag; + dest->d[1].c[1].real = -src1->d[1].c[1].real + ss*src2->d[1].c[1].real; + dest->d[1].c[1].imag = -src1->d[1].c[1].imag + ss*src2->d[1].c[1].imag; + dest->d[1].c[2].real = -src1->d[1].c[2].real + ss*src2->d[1].c[2].real; + dest->d[1].c[2].imag = -src1->d[1].c[2].imag + ss*src2->d[1].c[2].imag; + + dest->d[2].c[0].real = -src1->d[2].c[0].real + ss*src2->d[2].c[0].real; + dest->d[2].c[0].imag = -src1->d[2].c[0].imag + ss*src2->d[2].c[0].imag; + dest->d[2].c[1].real = -src1->d[2].c[1].real + ss*src2->d[2].c[1].real; + dest->d[2].c[1].imag = -src1->d[2].c[1].imag + ss*src2->d[2].c[1].imag; + dest->d[2].c[2].real = -src1->d[2].c[2].real + ss*src2->d[2].c[2].real; + dest->d[2].c[2].imag = -src1->d[2].c[2].imag + ss*src2->d[2].c[2].imag; + + dest->d[3].c[0].real = -src1->d[3].c[0].real + ss*src2->d[3].c[0].real; + dest->d[3].c[0].imag = -src1->d[3].c[0].imag + ss*src2->d[3].c[0].imag; + dest->d[3].c[1].real = -src1->d[3].c[1].real + ss*src2->d[3].c[1].real; + dest->d[3].c[1].imag = -src1->d[3].c[1].imag + ss*src2->d[3].c[1].imag; + dest->d[3].c[2].real = -src1->d[3].c[2].real + ss*src2->d[3].c[2].real; + dest->d[3].c[2].imag = -src1->d[3].c[2].imag + ss*src2->d[3].c[2].imag; + +#endif +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_hwvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_hwvec.c new file mode 100644 index 0000000000000000000000000000000000000000..977b9853d97d04ee8cda8fa5047578451d5dec10 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_hwvec.c @@ -0,0 +1,38 @@ +/******************** s_m_hwvec.c (in su3.a) ******************** +* +*void scalar_mult_hwvec(half_wilson_vector *src, radix s, + half_wilson_vector *dest) +* Multiply a half Wilson vector by a scalar +* dest <- s*src +*/ +#include "complex.h" +#include "su3.h" + +void scalar_mult_hwvec( half_wilson_vector *src, radix s, + half_wilson_vector *dest ){ + +#ifndef NATIVEDOUBLE + register int i; + for(i=0;i<2;i++)scalar_mult_su3_vector( &(src->h[i]), s, &(dest->h[i])); + +#else /* RS6000 version */ + + register double ss; + ss = s; + + dest->h[0].c[0].real = ss*src->h[0].c[0].real; + dest->h[0].c[0].imag = ss*src->h[0].c[0].imag; + dest->h[0].c[1].real = ss*src->h[0].c[1].real; + dest->h[0].c[1].imag = ss*src->h[0].c[1].imag; + dest->h[0].c[2].real = ss*src->h[0].c[2].real; + dest->h[0].c[2].imag = ss*src->h[0].c[2].imag; + + dest->h[1].c[0].real = ss*src->h[1].c[0].real; + dest->h[1].c[0].imag = ss*src->h[1].c[0].imag; + dest->h[1].c[1].real = ss*src->h[1].c[1].real; + dest->h[1].c[1].imag = ss*src->h[1].c[1].imag; + dest->h[1].c[2].real = ss*src->h[1].c[2].real; + dest->h[1].c[2].imag = ss*src->h[1].c[2].imag; + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_mat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_mat.c new file mode 100644 index 0000000000000000000000000000000000000000..e79276418c66646b267cb2a543f4bb7b66c867be --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_mat.c @@ -0,0 +1,47 @@ +/****************** s_m_mat.c (in su3.a) ****************************** +* * +* void scalar_mult_su3_matrix( su3_matrix *a, radix s, su3_matrix *b) * +* B <- s*A * +*/ +#include "complex.h" +#include "su3.h" + +/* b <- s*a, matrices */ +void scalar_mult_su3_matrix( su3_matrix *a, radix s, su3_matrix *b ){ + +#ifndef NATIVEDOUBLE +register int i,j; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + b->e[i][j].real = s*a->e[i][j].real; + b->e[i][j].imag = s*a->e[i][j].imag; + } + +#else /* RS6000 version */ + + register double ss; + + ss = s; + + b->e[0][0].real = ss*a->e[0][0].real; + b->e[0][0].imag = ss*a->e[0][0].imag; + b->e[0][1].real = ss*a->e[0][1].real; + b->e[0][1].imag = ss*a->e[0][1].imag; + b->e[0][2].real = ss*a->e[0][2].real; + b->e[0][2].imag = ss*a->e[0][2].imag; + + b->e[1][0].real = ss*a->e[1][0].real; + b->e[1][0].imag = ss*a->e[1][0].imag; + b->e[1][1].real = ss*a->e[1][1].real; + b->e[1][1].imag = ss*a->e[1][1].imag; + b->e[1][2].real = ss*a->e[1][2].real; + b->e[1][2].imag = ss*a->e[1][2].imag; + + b->e[2][0].real = ss*a->e[2][0].real; + b->e[2][0].imag = ss*a->e[2][0].imag; + b->e[2][1].real = ss*a->e[2][1].real; + b->e[2][1].imag = ss*a->e[2][1].imag; + b->e[2][2].real = ss*a->e[2][2].real; + b->e[2][2].imag = ss*a->e[2][2].imag; + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_s_mat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_s_mat.c new file mode 100644 index 0000000000000000000000000000000000000000..81913a62a921f4160e0e2b0e7aa418fc2f1677cf --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_s_mat.c @@ -0,0 +1,49 @@ +/**************** s_m_s_mat.c (in su3.a) ****************************** +* * +* void scalar_mult_sub_su3_matrix( su3_matrix *a, su3_matrix *b, * +* radix s, su3_matrix *c) * +* C <- A - s*B, A,B and C matrices * +*/ +#include "complex.h" +#include "su3.h" + +/* c <- a - s*b, matrices */ +void scalar_mult_sub_su3_matrix(su3_matrix *a,su3_matrix *b,radix s, + su3_matrix *c){ + +#ifndef NATIVEDOUBLE +register int i,j; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + c->e[i][j].real = a->e[i][j].real - s*b->e[i][j].real; + c->e[i][j].imag = a->e[i][j].imag - s*b->e[i][j].imag; + } + +#else /* RS6000 version */ + + register double ss; + + ss = s; + + c->e[0][0].real = a->e[0][0].real - ss*b->e[0][0].real; + c->e[0][0].imag = a->e[0][0].imag - ss*b->e[0][0].imag; + c->e[0][1].real = a->e[0][1].real - ss*b->e[0][1].real; + c->e[0][1].imag = a->e[0][1].imag - ss*b->e[0][1].imag; + c->e[0][2].real = a->e[0][2].real - ss*b->e[0][2].real; + c->e[0][2].imag = a->e[0][2].imag - ss*b->e[0][2].imag; + + c->e[1][0].real = a->e[1][0].real - ss*b->e[1][0].real; + c->e[1][0].imag = a->e[1][0].imag - ss*b->e[1][0].imag; + c->e[1][1].real = a->e[1][1].real - ss*b->e[1][1].real; + c->e[1][1].imag = a->e[1][1].imag - ss*b->e[1][1].imag; + c->e[1][2].real = a->e[1][2].real - ss*b->e[1][2].real; + c->e[1][2].imag = a->e[1][2].imag - ss*b->e[1][2].imag; + + c->e[2][0].real = a->e[2][0].real - ss*b->e[2][0].real; + c->e[2][0].imag = a->e[2][0].imag - ss*b->e[2][0].imag; + c->e[2][1].real = a->e[2][1].real - ss*b->e[2][1].real; + c->e[2][1].imag = a->e[2][1].imag - ss*b->e[2][1].imag; + c->e[2][2].real = a->e[2][2].real - ss*b->e[2][2].real; + c->e[2][2].imag = a->e[2][2].imag - ss*b->e[2][2].imag; + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_s_vec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_s_vec.c new file mode 100644 index 0000000000000000000000000000000000000000..afa63246228d67effecee462aa85075133f1e409 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_s_vec.c @@ -0,0 +1,35 @@ +/***************** s_m_s_vec.c (in su3.a) ***************************** +* * +* void scalar_mult_sub_su3_vector( su3_vector *a, su3_vector *b, * +* radix s, su3_vector *c) * +* C <- A - s*B, A,B and C vectors * +*/ +#include "complex.h" +#include "su3.h" + +/* c <- a - s*b, vectors */ +void scalar_mult_sub_su3_vector(su3_vector *a,su3_vector *b,radix s, + su3_vector *c){ + +#ifndef NATIVEDOUBLE + register int i; + for(i=0;i<3;i++){ + c->c[i].real = a->c[i].real - s*b->c[i].real; + c->c[i].imag = a->c[i].imag - s*b->c[i].imag; + } + +#else /* RS6000 version */ + + register double ss; + + ss = s; + + c->c[0].real = a->c[0].real - ss*b->c[0].real; + c->c[0].imag = a->c[0].imag - ss*b->c[0].imag; + c->c[1].real = a->c[1].real - ss*b->c[1].real; + c->c[1].imag = a->c[1].imag - ss*b->c[1].imag; + c->c[2].real = a->c[2].real - ss*b->c[2].real; + c->c[2].imag = a->c[2].imag - ss*b->c[2].imag; + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_sum_vec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_sum_vec.c new file mode 100644 index 0000000000000000000000000000000000000000..3ecd401077d95ed1f659980a69d6e90715d4e6cf --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_sum_vec.c @@ -0,0 +1,33 @@ +/**************** s_m_sum_vec.c (in su3.a) **************************** +* * +* void scalar_mult_sum_su3_vector( su3_vector *a, su3_vector *b, radix s )* +* A <- A + s*B, A and B vectors * +*/ +#include "complex.h" +#include "su3.h" + +/* a <- a + s*b, vectors */ +void scalar_mult_sum_su3_vector(su3_vector *a, su3_vector *b, radix s){ + +#ifndef NATIVEDOUBLE +register int i; + for(i=0;i<3;i++){ + a->c[i].real += s*b->c[i].real; + a->c[i].imag += s*b->c[i].imag; + } + +#else /* RS6000 version */ + + register double ss; + + ss = s; + + a->c[0].real += ss*b->c[0].real; + a->c[0].imag += ss*b->c[0].imag; + a->c[1].real += ss*b->c[1].real; + a->c[1].imag += ss*b->c[1].imag; + a->c[2].real += ss*b->c[2].real; + a->c[2].imag += ss*b->c[2].imag; + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_vec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_vec.c new file mode 100644 index 0000000000000000000000000000000000000000..6ec0cf4bcade6f2dfe0c0521828f48f9d4068b73 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_vec.c @@ -0,0 +1,33 @@ +/****************** s_m_vec.c (in su3.a) ****************************** +* * +* void scalar_mult_su3_vector( su3_vector *a, radix s, su3_vector *c) * +* C <- s*A, A and C vectors * +*/ +#include "complex.h" +#include "su3.h" + +/* c <- s*a, vectors */ +void scalar_mult_su3_vector( su3_vector *a, radix s, su3_vector *c){ + +#ifndef NATIVEDOUBLE +register int i; + for(i=0;i<3;i++){ + c->c[i].real = s*a->c[i].real; + c->c[i].imag = s*a->c[i].imag; + } + +#else /* RS6000 version */ + + register double ss; + + ss = s; + + c->c[0].real = ss*a->c[0].real; + c->c[0].imag = ss*a->c[0].imag; + c->c[1].real = ss*a->c[1].real; + c->c[1].imag = ss*a->c[1].imag; + c->c[2].real = ss*a->c[2].real; + c->c[2].imag = ss*a->c[2].imag; + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..e2b5c2b8a22552a7a2dad387f2fa4f5066a13ef1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_wvec.c @@ -0,0 +1,50 @@ +/******************** s_m_wvec.c (in su3.a) ******************** +* +*void scalar_mult_wvec(wilson_vector *src, radix s, wilson_vector *dest) +* Multiply a Wilson vector by a scalar +* dest <- s*src +*/ +#include "complex.h" +#include "su3.h" + +void scalar_mult_wvec( wilson_vector *src, radix s, wilson_vector *dest){ + +#ifndef NATIVEDOUBLE +register int i; + for(i=0;i<4;i++)scalar_mult_su3_vector( &(src->d[i]), s, &(dest->d[i])); + +#else /* RS6000 version */ + + register double ss; + ss = s; + + dest->d[0].c[0].real = ss*src->d[0].c[0].real; + dest->d[0].c[0].imag = ss*src->d[0].c[0].imag; + dest->d[0].c[1].real = ss*src->d[0].c[1].real; + dest->d[0].c[1].imag = ss*src->d[0].c[1].imag; + dest->d[0].c[2].real = ss*src->d[0].c[2].real; + dest->d[0].c[2].imag = ss*src->d[0].c[2].imag; + + dest->d[1].c[0].real = ss*src->d[1].c[0].real; + dest->d[1].c[0].imag = ss*src->d[1].c[0].imag; + dest->d[1].c[1].real = ss*src->d[1].c[1].real; + dest->d[1].c[1].imag = ss*src->d[1].c[1].imag; + dest->d[1].c[2].real = ss*src->d[1].c[2].real; + dest->d[1].c[2].imag = ss*src->d[1].c[2].imag; + + dest->d[2].c[0].real = ss*src->d[2].c[0].real; + dest->d[2].c[0].imag = ss*src->d[2].c[0].imag; + dest->d[2].c[1].real = ss*src->d[2].c[1].real; + dest->d[2].c[1].imag = ss*src->d[2].c[1].imag; + dest->d[2].c[2].real = ss*src->d[2].c[2].real; + dest->d[2].c[2].imag = ss*src->d[2].c[2].imag; + + dest->d[3].c[0].real = ss*src->d[3].c[0].real; + dest->d[3].c[0].imag = ss*src->d[3].c[0].imag; + dest->d[3].c[1].real = ss*src->d[3].c[1].real; + dest->d[3].c[1].imag = ss*src->d[3].c[1].imag; + dest->d[3].c[2].real = ss*src->d[3].c[2].real; + dest->d[3].c[2].imag = ss*src->d[3].c[2].imag; + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3.h new file mode 100644 index 0000000000000000000000000000000000000000..0596ea46cf335de51d48ff6db545b5f36447fa72 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3.h @@ -0,0 +1,354 @@ +/****************************** su3.h ********************************** +* * +* Defines and subroutine declarations for SU3 simulation * +* MIMD version 3 * +* * +*/ +/* #define radix double takes radix from complex.h */ +typedef struct { complex e[3][3]; } su3_matrix; +typedef struct { complex c[3]; } su3_vector; +typedef struct + { complex m01,m02,m12; radix m00im,m11im,m22im; radix space; } anti_hermitmat; +typedef struct { su3_vector d[4]; } wilson_vector; +typedef struct { su3_vector h[2]; } half_wilson_vector; +typedef struct { wilson_vector c[3]; } color_wilson_vector; +typedef struct { color_wilson_vector d[4]; } wilson_matrix; + +typedef struct { radix l[8]; } adjoint_matrix; + +#define GAMMAFIVE -1 /* some integer which is not a direction */ +#define PLUS 1 /* flags for selecting M or M_adjoint */ +#define MINUS -1 +/* Macros to multiply complex numbers by +-1 and +-i */ +#define TIMESPLUSONE(a,b) { (b).real = (a).real; (b).imag = (a).imag; } +#define TIMESMINUSONE(a,b) { (b).real = -(a).real; (b).imag = -(a).imag; } +#define TIMESPLUSI(a,b) { (b).real = -(a).imag; (b).imag = (a).real; } +#define TIMESMINUSI(a,b) { (b).real = (a).imag; (b).imag = -(a).real; } + + +/* +* ROUTINES FOR SU(3) MATRIX OPERATIONS +* +* void mult_su3_nn( a,b,c ) +* su3_matrix *a,*b,*c; +* matrix multiply, no adjoints +* files "m_mat_nn.c", "m_mat_nn.m4" +* void mult_su3_na( a,b,c ) +* su3_matrix *a,*b,*c; +* matrix multiply, second matrix is adjoint +* files "m_mat_na.c", "m_mat_na.m4" +* void mult_su3_an( a,b,c ) +* su3_matrix *a,*b,*c; +* matrix multiply, first matrix is adjoint +* files "m_mat_an.c", "m_mat_an.m4" +* radix realtrace_su3(a,b) +* su3_matrix *a,*b; (Re(Tr( A_adjoint*B)) ) +* file "realtr.c" +* complex trace_su3(a) +* su3_matrix *a; +* file "trace_su3.c" +* complex complextrace_su3(a,b) +* su3_matrix *a,*b; (Tr( A_adjoint*B)) +* file "complextr.c" +* complex det_su3(a) +* su3_matrix *a; +* file "det_su3.c" +* void add_su3_matrix(a,b,c) +* su3_matrix *a,*b,*c; +* file "addmat.c" +* void sub_su3_matrix(a,b,c) +* su3_matrix *a,*b,*c; +* file "submat.c" +* void scalar_mult_su3_matrix(a,s,b) +* su3_matrix *a,*b; radix s; +* file "s_m_mat.c" +* void scalar_mult_add_su3_matrix(a,b,s,c) +* su3_matrix *a,*b,*c; radix s; +* file "s_m_a_mat.c" +* void scalar_mult_sub_su3_matrix(a,b,s,c) +* su3_matrix *a,*b,*c; radix s; +* file "s_m_s_mat.c" +* void c_scalar_mult_su3mat(m1,phase,m2) +* su3_matrix *m1,*m2; complex *phase; +* file "cs_m_mat.c" +* void c_scalar_mult_add_su3mat(m1,m2,phase,m3) +* su3_matrix *m1,*m2,*m3; complex *phase; +* file "cs_m_a_mat.c" +* void c_scalar_mult_sub_su3mat(m1,m2,phase,m3) +* su3_matrix *m1,*m2,*m3; complex *phase; +* file "cs_m_s_mat.c" +* void su3_adjoint(a,b) +* su3_matrix *a,*b; +* file "su3_adjoint.c" +* void make_anti_hermitian(m3,ah3) +* su3_matrix *m3; anti_hermitmat *ah3; +* file "make_ahmat.c" +* void random_anti_hermitian(mat_antihermit,prn_pt) +* anti_hermitmat *mat_antihermit; +* void *prn_pt; (passed through to myrand()) +* file "rand_ahmat.c" +* void uncompress_anti_hermitian(mat_antihermit,mat_su3) +* anti_hermitmat *mat_antihermit; su3_matrix *mat_su3; +* file "uncmp_ahmat.c" +* void compress_anti_hermitian(mat_su3,mat_antihermit) +* anti_hermitmat *mat_antihermit; su3_matrix *mat_su3; +* file "cmp_ahmat.c" +* void su3mat_copy(a,b) +* su3_matrix *a,*b; +* file "su3mat_copy.c" +* +* +* ROUTINES FOR su3_vector OPERATIONS ( 3 COMPONENT COMPLEX ) +* +* void c_scalar_mult_su3vec(v1,phase,v2) +* su3_vector *v1,*v2; complex *phase; +* file "cs_m_vec.c" +* void c_scalar_mult_add_su3vec(v1,phase,v2) +* su3_vector *v1,*v2; complex *phase; +* file "cs_m_a_vec.c" +* void c_scalar_mult_sub_su3vec(v1,phase,v2) +* su3_vector *v1,*v2; complex *phase; +* file "cs_m_s_vec.c" +* void su3_projector(a,b,c) +* su3_vector *a,*b; su3_matrix *c; +* ( outer product of A and B) +* file "su3_proj.c" +* void su3vec_copy(a,b) +* su3_vector *a,*b; +* file "su3vec_copy.c" +* +* void mult_su3_mat_vec( a,b,c ) +* su3_matrix *a; su3_vector *b,*c; +* file "m_matvec.c", "m_matvec.m4" +* void mult_su3_mat_vec_sum( a,b,c ) +* su3_matrix *a; su3_vector *b,*c; +* file "m_matvec_s.c", "m_matvec_s.m4" +* void mult_su3_mat_vec_sum_4dir( a,b0,b1,b2,b3,c ) +* su3_matrix *a; su3_vector *b0,*b1,*b2,*b3,*c; +* file "m_mv_s_4dir.c", "m_mv_s_4dir.m4" +* file "m_mv_s_4di2.m4" is alternate version with pipelined loads. +* Multiply four su3_vectors by elements of an array of su3_matrices, +* sum results. +* C <- A[0]*B0 + A[1]*B1 + A[2]*B2 + A[3]*B3 +* void mult_su3_mat_vec_nsum( a,b,c ) +* su3_matrix *a; su3_vector *b,*c; +* file "m_matvec_ns.c" +* void mult_adj_su3_mat_vec( a,b,c ) +* su3_matrix *a; su3_vector *b,*c; +* file "m_amatvec.c", "m_amatvec.m4" +* void mult_adj_su3_mat_vec_4dir( a,b,c ) +* su3_matrix *a; su3_vector *b,*c; +* file "m_amv_4dir.c", "m_amv_4dir.m4" +* file "m_amv_4di2.m4" is alternate version with pipelined loads. +* Multiply an su3_vector by adjoints of elements of an array +* of su3_matrices, results in an array of su3_vectors. +* C[i] <- A_adjoint[i]*B, i = 0,1,2,3 +* void mult_adj_su3_mat_vec_sum( a,b,c ) +* su3_matrix *a; su3_vector *b,*c; +* file "m_amatvec_s.c" +* void mult_adj_su3_mat_vec_nsum( a,b,c ) +* su3_matrix *a; su3_vector *b,*c; +* file "m_amatvec_ns.c" +* void add_su3_vector(a,b,c) +* su3_vector *a,*b,*c; +* file "addvec.c", "addvec.m4" +* void sub_su3_vector(a,b,c) +* su3_vector *a,*b,*c; +* file "subvec.c", "subvec.m4" +* void sub_four_su3_vecs(a,b1,b2,b3,b4) +* su3_vector *a,*b1,*b2,*b3,*b4; +* file "sub4vecs.c", "sub4vecs.m4" +* void scalar_mult_su3_vector(a,s,c) +* su3_vector *a,*c; radix s; +* file "s_m_vec.c" +* void scalar_mult_add_su3_vector(a,b,s,c) +* su3_vector *a,*b,*c; radix s; +* file "s_m_a_vec.c", "s_m_a_vec.m4" +* void scalar_mult_sum_su3_vector(a,b,s) +* su3_vector *a,*b; radix s; +* file "s_m_s_vec.c", "s_m_s_vec.m4" +* void scalar_mult_sub_su3_vector(a,b,s,c) +* su3_vector *a,*b,*c; radix s; +* file "s_m_s_vec.c" +* complex su3_dot(a,b) +* su3_vector *a,*b; +* file "su3_dot.c" +* radix su3_rdot(a,b) +* su3_vector *a,*b; +* file "su3_rdot.c", "su3_rdot.m4" +* radix magsq_su3vec(a) +* su3_vector *a; +* file "msq_su3vec.c", "msq_su3vec.m4" +* +* +* MISCELLANEOUS ROUTINES +* +* radix gaussian_rand_no(prn_pt) +* void *prn_pt; ( passed to myrand()) +* file "gaussrand.c" +* +* void dumpmat(m) +* su3_matrix *m; +* file "dumpmat.c" +* void dumpvec(v) +* su3_vector *v; +* file "dumpvec.c" +*/ + +/* Protoed by K.R */ + + +void mult_su3_nn (su3_matrix *, su3_matrix *, su3_matrix *); +void mult_su3_na (su3_matrix *, su3_matrix *, su3_matrix *); +void mult_su3_an (su3_matrix *, su3_matrix *, su3_matrix *); +radix realtrace_su3(su3_matrix *, su3_matrix *); +complex trace_su3(su3_matrix *); +complex complextrace_su3(su3_matrix *, su3_matrix *); +complex det_su3(su3_matrix *); +void add_su3_matrix(su3_matrix *, su3_matrix *, su3_matrix *); +void sub_su3_matrix(su3_matrix *, su3_matrix *, su3_matrix *); +void su3_adjoint(su3_matrix *, su3_matrix *); +void make_anti_hermitian(su3_matrix *, anti_hermitmat *ah3); +void random_anti_hermitian(anti_hermitmat *mat_antihermit,void *prn_pt); +void uncompress_anti_hermitian(anti_hermitmat *mat_antihermit,su3_matrix *mat_su3); +void compress_anti_hermitian(su3_matrix *mat_su3,anti_hermitmat *mat_antihermit); +void su3mat_copy(su3_matrix *, su3_matrix *); +void clear_su3mat( su3_matrix *dest ); +void clearvec( su3_vector *v ); + +void mult_su3_by_I(su3_matrix *,su3_matrix *); +void scalar_add_su3_matrix(su3_matrix *,radix , su3_matrix *); + +void c_scalar_mult_su3vec (su3_vector *, complex *,su3_vector *); +void c_scalar_mult_sub_su3vec(su3_vector *, complex *,su3_vector *); +void su3_projector(su3_vector *, su3_vector *, su3_matrix *); +void su3vec_copy(su3_vector *, su3_vector *); +void mult_su3_mat_vec(su3_matrix *,su3_vector *, su3_vector *); +void mult_su3_mat_vec_sum(su3_matrix *,su3_vector *, su3_vector *); +void mult_su3_mat_vec_sum_4dir(su3_matrix *,su3_vector *,su3_vector *, + su3_vector *,su3_vector *,su3_vector *); +void mult_su3_mat_vec_nsum(su3_matrix *,su3_vector *, su3_vector *); +void mult_adj_su3_mat_vec(su3_matrix *,su3_vector *, su3_vector *); +void mult_adj_su3_mat_vec_4dir(su3_matrix *,su3_vector *,su3_vector *); +void mult_adj_su3_mat_vec_sum(su3_matrix *,su3_vector *, su3_vector *); +void mult_adj_su3_mat_vec_nsum(su3_matrix *,su3_vector *, su3_vector *); +void add_su3_vector(su3_vector *,su3_vector *,su3_vector *); +void sub_su3_vector(su3_vector *,su3_vector *,su3_vector *); +void sub_four_su3_vecs(su3_vector *,su3_vector *,su3_vector *, + su3_vector *,su3_vector *); + +void scalar_mult_su3_vector( su3_vector *src, radix scalar, + su3_vector *dest); +void scalar_mult_add_su3_vector( su3_vector *src1, su3_vector *src2, + radix scalar, su3_vector *dest); +void scalar_mult_sum_su3_vector( su3_vector *src1, su3_vector *src2, + radix scalar); +void scalar_mult_sub_su3_vector( su3_vector *src1, su3_vector *src2, + radix scalar, su3_vector *dest); +void scalar_mult_su3_matrix( su3_matrix *src, radix scalar, + su3_matrix *dest); +void scalar_mult_add_su3_matrix( su3_matrix *src1, su3_matrix *src2, + radix scalar, su3_matrix *dest); +void scalar_mult_sub_su3_matrix( su3_matrix *src1, su3_matrix *src2, + radix scalar, su3_matrix *dest); +void c_scalar_mult_su3mat( su3_matrix *src, complex *scalar, + su3_matrix *dest); +void c_scalar_mult_add_su3mat( su3_matrix *src1, su3_matrix *src2, + complex *scalar, su3_matrix *dest); +void c_scalar_mult_sub_su3mat( su3_matrix *src1, su3_matrix *src2, + complex *scalar, su3_matrix *dest); +void scalar_mult_add_wvec( wilson_vector *src1, wilson_vector *src2, + radix scalar, wilson_vector *dest); +void scalar_mult_addtm_wvec( wilson_vector *src1, wilson_vector *src2, + radix scalar, wilson_vector *dest); +void c_scalar_mult_add_su3vec(su3_vector *v1, complex *phase, su3_vector +*v2); +void c_scalar_mult_add_wvec(wilson_vector *src1, wilson_vector *src2, complex *phase, wilson_vector *dest); + + +/* + * Adjoint Higgs protos + */ + +double act_gauge_adj(su3_matrix *a, su3_matrix *u,adjoint_matrix *b); +void compress_adjmat(su3_matrix *m3,adjoint_matrix *a3); +void uncompress_adjmat(adjoint_matrix *a3,su3_matrix *m3); +void make_adjointmat(su3_matrix *m3,adjoint_matrix *a3); +void add_adjmat(adjoint_matrix *a,adjoint_matrix *b,adjoint_matrix *t); +void adj_scalar_mul(adjoint_matrix *a,double d,adjoint_matrix *t); +void adj_scalar_mul_add(adjoint_matrix *a,double d,adjoint_matrix *t); +radix adj_sqr(adjoint_matrix *a); +radix adj_dot(adjoint_matrix *a,adjoint_matrix *b); +void mult_su3_ahiggs( su3_matrix *m, adjoint_matrix *a, adjoint_matrix *r ); +void mult_adj_su3_ahiggs( su3_matrix *m, adjoint_matrix *a, adjoint_matrix *r ); + +/* + * + * Added new ANSI protos -- Kari R. + * first, some Wilson operations + */ + +void mult_mat_wilson_vec( su3_matrix *mat, wilson_vector *src, + wilson_vector *dest); +void mult_su3_mat_hwvec( su3_matrix *mat, half_wilson_vector *src, + half_wilson_vector *dest); +void mult_adj_mat_wilson_vec( su3_matrix *mat, wilson_vector *src, + wilson_vector *dest); +void mult_adj_su3_mat_hwvec( su3_matrix *mat, + half_wilson_vector *src, half_wilson_vector *dest); +void add_wilson_vector( wilson_vector *src1, wilson_vector *src2, + wilson_vector *dest); +void sub_wilson_vector( wilson_vector *src1, wilson_vector *src2, + wilson_vector *dest); +void scalar_mult_wvec( wilson_vector *src, radix s, wilson_vector *dest); +void scalar_mult_hwvec( half_wilson_vector *src, radix s, + half_wilson_vector *dest); +radix magsq_wvec( wilson_vector *src); +complex wvec_dot( wilson_vector *src1, wilson_vector *src2 ); +complex wvec2_dot( wilson_vector *src1, wilson_vector *src2 ); +radix wvec_rdot( wilson_vector *src1, wilson_vector *src2 ); +void su3_projector_w(wilson_vector *a, wilson_vector *b, su3_matrix *c); +void copy_wvec( wilson_vector *src, wilson_vector *dest); +void clear_wvec( wilson_vector *dest); +void wp_shrink( wilson_vector *src, half_wilson_vector *dest,int dir, int sign); +void wp_shrink_4dir(wilson_vector *a,half_wilson_vector *b1, + half_wilson_vector *b2,half_wilson_vector *b3, + half_wilson_vector *b4,int sign); +void wp_grow( half_wilson_vector *src, wilson_vector *dest,int dir, int sign); +void wp_grow_add( half_wilson_vector *src, wilson_vector *dest, + int dir, int sign); +void grow_add_four_wvecs(wilson_vector *a,half_wilson_vector *b1, + half_wilson_vector *b2,half_wilson_vector *b3, + half_wilson_vector *b4,int sign,int sum); +void mult_by_gamma( wilson_vector *src, wilson_vector *dest, int dir ); +void mult_by_gamma_left( wilson_matrix *src, wilson_matrix *dest, int dir ); +void mult_by_gamma_right(wilson_matrix *src, wilson_matrix *dest, int dir ); +void dump_wilson_vec( wilson_vector *src); + +/* SOME SU3 PROTOS + * Kari R. + */ + +complex su3_dot(su3_vector *a,su3_vector *b); +radix su3_rdot(su3_vector *a,su3_vector *b); +radix magsq_su3vec(su3_vector *a); + +void reunit_su3( su3_matrix *l ); +void reunitarize( su3_matrix *link[] ); +void random_su3P( su3_matrix *l, int hits ); + + +/* + * MISCELLANEOUS ROUTINES + * Kari R. : wrote protos + */ + +radix gaussian_rand_no( void *); +void dumpmat(su3_matrix *m); +void dumpvec(su3_vector *v); + +int prefetch_matrix(su3_matrix *); +int prefetch_vector(su3_vector *); +int prefetch_adjoint(adjoint_matrix *); + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3.orig.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3.orig.h new file mode 100644 index 0000000000000000000000000000000000000000..e5610dfcdd9567649ed3827e2d2f6445b1c05640 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3.orig.h @@ -0,0 +1,446 @@ +/****************************** su3.h ********************************** +* * +* Defines and subroutine declarations for SU3 simulation * +* MIMD version 5 * +* * +*/ +typedef struct { complex e[3][3]; } su3_matrix; +typedef struct { complex c[3]; } su3_vector; +typedef struct + { complex m01,m02,m12; radix m00im,m11im,m22im; radix space; } anti_hermitmat; + +/* e.g. */ +/* wilson_propagator prop; */ +/* prop.c[ci].d[si].d[sf].c[cf] */ +/* -----------------------> complex */ +/* -----------------> su3_vector */ +/* -----------> wilson_vector */ +/* -----> spin_wilson_vector */ +/* e.g. */ +/* wilson_matrix matr; */ +/* matr.d[si].c[ci].d[sf].c[cf] */ +/* -----------------------> complex */ +/* -----------------> su3_vector */ +/* -----------> wilson_vector */ +/* -----> color_wilson_vector */ + +/* Object with two Dirac and two color indices. A given element + of a "wilson_propagator" is accessed by + object.c[color1].d[spin1].d[spin2].c[color2].real , etc. + As alway, "d" denotes a Dirac index and "c" a color index. + "1" refers to the source, "2" to the sink. +*/ + +typedef struct { su3_vector d[4]; } wilson_vector; +typedef struct { su3_vector h[2]; } half_wilson_vector; +typedef struct { wilson_vector c[3]; } color_wilson_vector; +typedef struct { wilson_vector d[4]; } spin_wilson_vector; +typedef struct { color_wilson_vector d[4]; } wilson_matrix; +typedef struct { spin_wilson_vector c[3]; } wilson_propagator; + +#define GAMMAFIVE -1 /* some integer which is not a direction */ +#define PLUS 1 /* flags for selecting M or M_adjoint */ +#define MINUS -1 +/* Macros to multiply complex numbers by +-1 and +-i */ +#define TIMESPLUSONE(a,b) { (b).real = (a).real; (b).imag = (a).imag; } +#define TIMESMINUSONE(a,b) { (b).real = -(a).real; (b).imag = -(a).imag; } +#define TIMESPLUSI(a,b) { (b).real = -(a).imag; (b).imag = (a).real; } +#define TIMESMINUSI(a,b) { (b).real = (a).imag; (b).imag = -(a).real; } + +/* random number routines */ +typedef struct { + /* We assume long is at least 32 bits */ + unsigned long r0,r1,r2,r3,r4,r5,r6; + unsigned long multiplier,addend,ic_state; + radix scale; +} double_prn; +void initialize_prn(double_prn *prn_pt, int seed, int index); +radix myrand( double_prn *prn_pt ); + + +/* +* ROUTINES FOR SU(3) MATRIX OPERATIONS +* +* void mult_su3_nn( su3_matrix *a, su3_matrix *b, su3_matrix *c ) +* matrix multiply, no adjoints +* files "m_mat_nn.c", "m_mat_nn.m4" +* void mult_su3_na( su3_matrix *a, su3_matrix *b, su3_matrix *c ) +* matrix multiply, second matrix is adjoint +* files "m_mat_na.c", "m_mat_na.m4" +* void mult_su3_an( su3_matrix *a, su3_matrix *b, su3_matrix *c ) +* matrix multiply, first matrix is adjoint +* files "m_mat_an.c", "m_mat_an.m4" +* radix realtrace_su3( su3_matrix *a, su3_matrix *b ) +* (Re(Tr( A_adjoint*B)) ) +* file "realtr.c" +* complex trace_su3( su3_matrix *a ) +* file "trace_su3.c" +* complex complextrace_su3( su3_matrix *a, su3_matrix *b ) +* (Tr( A_adjoint*B)) +* file "complextr.c" +* complex det_su3( su3_matrix *a ) +* file "det_su3.c" +* void add_su3_matrix( su3_matrix *a, su3_matrix *b, su3_matrix *c ) +* file "addmat.c" +* void sub_su3_matrix( su3_matrix *a, su3_matrix *b, su3_matrix *c ) +* file "submat.c" +* void scalar_mult_su3_matrix( su3_matrix *a, radix s, su3_matrix *b ) +* file "s_m_mat.c" +* void scalar_mult_add_su3_matrix( su3_matrix *a, su3_matrix *b, +* radix s, su3_matrix *c) +* file "s_m_a_mat.c" +* void scalar_mult_sub_su3_matrix( su3_matrix *a, su3_matrix *b, +* radix s, su3_matrix *c) +* file "s_m_s_mat.c" +* void c_scalar_mult_su3mat( su3_matrix *src, complex *phase, su3_matrix *dest) +* file "cs_m_mat.c" +* void c_scalar_mult_add_su3mat( su3_matrix *m1, su3_matrix *m2, +* complex *phase, su3_matrix *m3) +* file "cs_m_a_mat.c" +* void c_scalar_mult_sub_su3mat( su3_matrix *m1, su3_matrix *m2, +* complex *phase, su3_matrix *m3) +* file "cs_m_s_mat.c" +* void su3_adjoint( su3_matrix *a, su3_matrix *b ) +* file "su3_adjoint.c" +* void make_anti_hermitian( su3_matrix *m3, anti_hermitmat *ah3 ) +* file "make_ahmat.c" +* void random_anti_hermitian( anti_hermitmat *mat_antihermit, void *prn_pt ) +* (prn_pt passed through to myrand()) +* file "rand_ahmat.c" +* void uncompress_anti_hermitian( anti_hermitmat *mat_anti, su3_matrix *mat ) +* file "uncmp_ahmat.c" +* void compress_anti_hermitian( su3_matrix *mat, anti_hermitmat *mat_anti) +* file "cmp_ahmat.c" +* void clear_su3mat( su3_matrix *dest ); +* file clear_mat.c +* dest <- 0.0 +* void su3mat_copy( su3_matrix *a, su3_matrix *b ) +* file "su3mat_copy.c" +* void dumpmat( su3_matrix *m ) +* file "dumpmat.c" +* +* +* ROUTINES FOR su3_vector OPERATIONS ( 3 COMPONENT COMPLEX ) +* +* void su3_projector( su3_vector *a, su3_vector *b, su3_matrix *c ) +* ( outer product of A and B) +* file "su3_proj.c" +* complex su3_dot( su3_vector *a, su3_vector *b ) +* file "su3_dot.c" +* radix su3_rdot( su3_vector *a, su3_vector *b ) +* file "su3_rdot.c", "su3_rdot.m4" +* radix magsq_su3vec( su3_vector *a ) +* file "msq_su3vec.c", "msq_su3vec.m4" +* void su3vec_copy( su3_vector *a, su3_vector *b ) +* file "su3vec_copy.c" +* +* void mult_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c ) +* C <- A*B +* file "m_matvec.c", "m_matvec.m4" +* void mult_su3_mat_vec_sum( su3_matrix *a, su3_vector *b, su3_vector *c ) +* C <- C + A*B +* file "m_matvec_s.c", "m_matvec_s.m4" +* void mult_su3_mat_vec_sum_4dir( su3_matrix *a, su3_vector *b0, +* su3_vector *b1, su3_vector *b2, su3_vector *b3, su3_vector *c ) +* file "m_mv_s_4dir.c", "m_mv_s_4dir.m4" +* file "m_mv_s_4di2.m4" is alternate version with pipelined loads. +* Multiply four su3_vectors by elements of an array of su3_matrices, +* sum results. +* C <- A[0]*B0 + A[1]*B1 + A[2]*B2 + A[3]*B3 +* void mult_su3_mat_vec_nsum( su3_matrix *a, su3_vector *b, su3_vector *c ) +* file "m_matvec_ns.c" +* void mult_adj_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c ) +* file "m_amatvec.c", "m_amatvec.m4" +* void mult_adj_su3_mat_vec_4dir( su3_matrix *a, su3_vector *b, su3_vector *c ) +* file "m_amv_4dir.c", "m_amv_4dir.m4" +* file "m_amv_4di2.m4" is alternate version with pipelined loads. +* Multiply an su3_vector by adjoints of elements of an array +* of su3_matrices, results in an array of su3_vectors. +* C[i] <- A_adjoint[i]*B, i = 0,1,2,3 +* void mult_adj_su3_mat_vec_sum( su3_matrix *a, su3_vector *b, su3_vector *c ) +* file "m_amatvec_s.c" +* void mult_adj_su3_mat_vec_nsum( su3_matrix *a, su3_vector *b, su3_vector *c ) +* file "m_amatvec_ns.c" +* void add_su3_vector( su3_vector *a, su3_vector *b, su3_vector *c ) +* file "addvec.c", "addvec.m4" +* void sub_su3_vector( su3_vector *a, su3_vector *b, su3_vector *c ) +* file "subvec.c", "subvec.m4" +* void sub_four_su3_vecs( su3_vector *a, su3_vector *b1, su3_vector *b2, +* su3_vector *b3, su3_vector *b4 ) +* file "sub4vecs.c", "sub4vecs.m4" +* +* void scalar_mult_su3_vector( su3_vector *a, radix s, su3_vector *c ) +* file "s_m_vec.c" +* void scalar_mult_add_su3_vector( su3_vector *a, su3_vector *b, radix s, +* su3_vector *c) +* file "s_m_a_vec.c", "s_m_a_vec.m4" +* void scalar_mult_sum_su3_vector( su3_vector *a, su3_vector *b, radix s ) +* file "s_m_sum_vec.c", "s_m_sum_vec.m4" +* void scalar_mult_sub_su3_vector( su3_vector *a, su3_vector *b, radix s, +* su3_vector *c ) +* file "s_m_s_vec.c" +* void c_scalar_mult_su3vec( su3_vector *src, complex *phase, su3_vector *dest ) +* file "cs_m_vec.c" +* void c_scalar_mult_add_su3vec( su3_vector *v1, complex *phase, su3_vector *v2) +* file "cs_m_a_vec.c" +* void c_scalar_mult_sub_su3vec( su3_vector *v1, complex *phase, su3_vector *v2) +* file "cs_m_s_vec.c" +* void dumpvec( su3_vector *v ) +* file "dumpvec.c" +* void clearvec( su3_vector *v ) +* file "clearvec.c" +* +* ROUTINES FOR WILSON VECTORS +* +* void mult_mat_wilson_vec( su3_matrix *mat, wilson_vector *src, +* wilson_vector *dest ); +* file m_mat_wvec.c +* dest <- mat*src +* void mult_su3_mat_hwvec( su3_matrix *mat, half_wilson_vector *src, +* half_wilson_vector *dest ); +* file m_mat_hwvec.c +* dest <- mat*src +* void mult_adj_mat_wilson_vec( su3_matrix *mat, wilson_vector *src, +* wilson_vector *dest) +* file m_amat_wvec.c +* dest <- mat_adjoint*src +* void mult_adj_su3_mat_hwvec su3_matrix *mat, +* half_wilson_vector *src, half_wilson_vector *dest ) +* file m_amat_hwvec.c +* dest <- mat_adjoint*src +* +* void add_wilson_vector( wilson_vector *src1, wilson_vector *src2, +* wilson_vector *dest ); +* file add_wvec.c +* dest <- src1+src2 +* void sub_wilson_vector( wilson_vector *src1, wilson_vector *src2, +* wilson_vector *dest ); +* file sub_wvec.c +* dest <- src1-src2 +* +* void scalar_mult_wvec wilson_vector *src, radix s, wilson_vector *dest ) +* file s_m_wvec.c +* dest <- s*src +* void scalar_mult_hwvec( half_wilson_vector *src, radix s, +* half_wilson_vector *dest) +* file s_m_hwvec.c +* dest <- s*src +* radix magsq_wvec( wilson_vector *src ); +* file msq_wvec.c +* s <- squared magnitude of src +* complex wvec_dot( wilson_vector *src1, wilson_vector *src2 ); +* file wvec_dot.c +* c <- dot product of src1 and src2 +* complex wvec2_dot( wilson_vector *src1, wilson_vector *src2 ); +* file wvec2_dot.c +* c <- dot product of src1 and src2, Used only in Claude's +* mrilu.c, I don't know what the difference is. DT. +* radix wvec_rdot( wilson_vector *a, wilson_vector *b ) +* wilson_vector *a,*b; +* file "wvec_rdot.c", "wvec_rdot.m4" +* r <- real part of dot product of src1 and src2 +* void scalar_mult_add_wvec( wilson_vector *src1,*src2, radix s, +* wilson_vector *dest) +* file s_m_a_wvec.c +* dest <- src1 + s*src2 +* void scalar_mult_addtm_wvec( wilson_vector *src1,*src2, radix s, +* wilson_vector *dest) +* file s_m_atm_wvec.c +* dest <- -src1 + s*src2 ("atm"="add to minus") +* void c_scalar_mult_add_wvec( wilson_vector *v1, wilson_vector *v2, +* complex *phase, wilson_vector *v3) +* file "cs_m_a_wvec.c" +* void c_scalar_mult_add_wvec2( wilson_vector *v1, wilson_vector *v2, +* complex scalar, wilson_vector *v3) +* file "cs_m_a_wvec2.c" +* differs from previous one: value of scalar, not address is arg. +* void wp_shrink( wilson_vector *src, half_wilson_vector *dest, +* int dir, int sign ); +* file wp_shrink.c , wp_shrink.m4 +* if(dir = [XYZT]UP) dest <- components of src along eigenvectors +* of gamma_dir with eigenvalue +1 +* if(dir = [XYZT]DOWN) dest <- components of src along eigenvectors +* of gamma_dir with eigenvalue -1 +* if(sign==MINUS)switch roles of +1 and -1 +* void wp_shrink_4dir( wilson_vector *a, half_wilson_vector *b1, +* half_wilson_vector *b2, half_wilson_vector *b3, +* half_wilson_vector *b4, int sign ); +* file wp_shrink4.c wp_shrink4.m4 +* Shrink A in X,Y,Z,T directions respectively, results in B1-B4 +* void wp_grow( half_wilson_vector *src, wilson_vector *dest, +* int dir, int sign ); +* file wp_grow.c , wp_grow.m4 +* if(dir = [XYZT]UP) dest <- components of src times eigenvectors +* of gamma_dir with eigenvalue +1 +* if(dir = [XYZT]DOWN) dest <- components of src times eigenvectors +* of gamma_dir with eigenvalue -1 +* if(sign==MINUS)switch roles of +1 and -1 +* Note: wp_shrink( +-dir) followed by wp_grow( +-dir) amounts to +* multiplication by 1+-gamma_dir, or 1-+gamma_dir if sign=MINUS +* void wp_grow_add( half_wilson_vector *src, wilson_vector *dest, +* int dir, int sign ); +* file wp_grow_a.c , wp_grow_a.m4 +* wp_grow, and add result to previous contents of dest. +* void grow_add_four_wvecs( wilson_vector *a, half_wilson_vector *b1, +* half_wilson_vector *b2, half_wilson_vector *b3, +* half_wilson_vector *b4, int sign, int sum ); +* file grow4wvecs.c grow4wvecs.m4 +* If sum==0 +* Grow b1-b4 in X,Y,Z,T directions respectively, sum of results in A +* If sum==1 +* Grow b1-b4 in X,Y,Z,T directions respectively, add to current A +* +* void mult_by_gamma( wilson_vector *src, wilson_vector *dest, int dir ); +* file mb_gamma.c +* dest <- gamma[dir] * src, dir=[XYZT]UP,GAMMAFIVE +* void mult_by_gamma_left( wilson_matrix *src, wilson_matrix *dest, int dir ); +* file mb_gamma_l.c +* dest <- gamma[dir] * src, dir=[XYZT]UP,GAMMAFIVE +* acts on first index of matrix +* void mult_by_gamma_right( wilson_matrix *src, wilson_matrix *dest, int dir ); +* file mb_gamma_r.c +* dest_ij <- gamma[dir]_ik * src_jk, dir=[XYZT]UP,GAMMAFIVE +* acts on second index of matrix +* +* void su3_projector_w( wilson_vector *a, wilson_vector *b, su3_matrix *c ) +* sum over spins of outer product of A.d[s] and B.d[s] - a three +* by three complex matrix +* file "su3_proj_w.c" +* void clear_wvec( wilson_vector *dest ); +* file clear_wvec.c +* dest <- 0.0 +* void copy_wvec( wilson_vector *src, wilson_vector *dest ); +* file copy_wvec.c +* dest <- src +* void dump_wilson_vec( wilson_vector *src ); +* file dump_wvec.c +* print out a wilson vector +* +* MISCELLANEOUS ROUTINES +* +* radix gaussian_rand_no( void *prn_pt ) +* void *prn_pt; ( passed to myrand()) +* file "gaussrand.c" +* +*/ + +int prefetch_matrix(su3_matrix *); +int prefetch_vector(su3_vector *); +int prefetch_adjoint(adjoint_matrix *); + +void mult_su3_nn ( su3_matrix *a, su3_matrix *b, su3_matrix *c ); +void mult_su3_na ( su3_matrix *a, su3_matrix *b, su3_matrix *c ); +void mult_su3_an ( su3_matrix *a, su3_matrix *b, su3_matrix *c ); +radix realtrace_su3( su3_matrix *a, su3_matrix *b ); +complex trace_su3( su3_matrix *a ); +complex complextrace_su3( su3_matrix *a, su3_matrix *b ); +complex det_su3( su3_matrix *a ); +void add_su3_matrix( su3_matrix *a, su3_matrix *b, su3_matrix *c ); +void sub_su3_matrix( su3_matrix *a, su3_matrix *b, su3_matrix *c ); +void scalar_mult_su3_matrix( su3_matrix *src, radix scalar, su3_matrix *dest); +void scalar_mult_add_su3_matrix( su3_matrix *src1, su3_matrix *src2, + radix scalar, su3_matrix *dest); +void scalar_mult_sub_su3_matrix( su3_matrix *src1, su3_matrix *src2, + radix scalar, su3_matrix *dest); +void c_scalar_mult_su3mat( su3_matrix *src, complex *scalar, + su3_matrix *dest); +void c_scalar_mult_add_su3mat( su3_matrix *src1, su3_matrix *src2, + complex *scalar, su3_matrix *dest); +void c_scalar_mult_sub_su3mat( su3_matrix *src1, su3_matrix *src2, + complex *scalar, su3_matrix *dest); +void su3_adjoint( su3_matrix *a, su3_matrix *b ); +void make_anti_hermitian( su3_matrix *m3, anti_hermitmat *ah3 ); +void random_anti_hermitian( anti_hermitmat *mat_antihermit, void *prn_pt ); +void uncompress_anti_hermitian( anti_hermitmat *mat_anti, su3_matrix *mat ); +void compress_anti_hermitian( su3_matrix *mat, anti_hermitmat *mat_anti); +void clear_su3mat( su3_matrix *dest ); +void su3mat_copy( su3_matrix *a, su3_matrix *b ); +void dumpmat( su3_matrix *m ); + +void su3_projector( su3_vector *a, su3_vector *b, su3_matrix *c ); +complex su3_dot( su3_vector *a, su3_vector *b ); +radix su3_rdot( su3_vector *a, su3_vector *b ); +radix magsq_su3vec( su3_vector *a ); +void su3vec_copy( su3_vector *a, su3_vector *b ); +void dumpvec( su3_vector *v ); +void clearvec( su3_vector *v ); + +void mult_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c ); +void mult_su3_mat_vec_sum( su3_matrix *a, su3_vector *b, su3_vector *c ); +void mult_su3_mat_vec_sum_4dir( su3_matrix *a, su3_vector *b0, + su3_vector *b1, su3_vector *b2, su3_vector *b3, su3_vector *c ); +void mult_su3_mat_vec_nsum( su3_matrix *a, su3_vector *b, su3_vector *c ); +void mult_adj_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c ); +void mult_adj_su3_mat_vec_4dir( su3_matrix *a, su3_vector *b, su3_vector *c ); +void mult_adj_su3_mat_vec_sum( su3_matrix *a, su3_vector *b, su3_vector *c ); +void mult_adj_su3_mat_vec_nsum( su3_matrix *a, su3_vector *b, su3_vector *c ); + +void add_su3_vector( su3_vector *a, su3_vector *b, su3_vector *c ); +void sub_su3_vector( su3_vector *a, su3_vector *b, su3_vector *c ); +void sub_four_su3_vecs( su3_vector *a, su3_vector *b1, su3_vector *b2, + su3_vector *b3, su3_vector *b4 ); + +void scalar_mult_su3_vector( su3_vector *src, radix scalar, + su3_vector *dest); +void scalar_mult_add_su3_vector( su3_vector *src1, su3_vector *src2, + radix scalar, su3_vector *dest); +void scalar_mult_sum_su3_vector( su3_vector *src1, su3_vector *src2, + radix scalar); +void scalar_mult_sub_su3_vector( su3_vector *src1, su3_vector *src2, + radix scalar, su3_vector *dest); +void scalar_mult_wvec( wilson_vector *src, radix s, wilson_vector *dest ); +void scalar_mult_hwvec( half_wilson_vector *src, radix s, + half_wilson_vector *dest ); +void scalar_mult_add_wvec( wilson_vector *src1, wilson_vector *src2, + radix scalar, wilson_vector *dest ); +void scalar_mult_addtm_wvec( wilson_vector *src1, wilson_vector *src2, + radix scalar, wilson_vector *dest ); +void c_scalar_mult_add_wvec(wilson_vector *src1, wilson_vector *src2, + complex *phase, wilson_vector *dest ); +void c_scalar_mult_add_wvec2(wilson_vector *src1, wilson_vector *src2, + complex s, wilson_vector *dest ); +void c_scalar_mult_su3vec( su3_vector *src, complex *phase, su3_vector *dest ); +void c_scalar_mult_add_su3vec(su3_vector *v1, complex *phase, su3_vector *v2); +void c_scalar_mult_sub_su3vec(su3_vector *v1, complex *phase, su3_vector *v2); + +void mult_mat_wilson_vec( su3_matrix *mat, wilson_vector *src, + wilson_vector *dest ); +void mult_su3_mat_hwvec( su3_matrix *mat, half_wilson_vector *src, + half_wilson_vector *dest ); +void mult_adj_mat_wilson_vec( su3_matrix *mat, wilson_vector *src, + wilson_vector *dest); +void mult_adj_su3_mat_hwvec( su3_matrix *mat, half_wilson_vector *src, + half_wilson_vector *dest ); + +void add_wilson_vector( wilson_vector *src1, wilson_vector *src2, + wilson_vector *dest ); +void sub_wilson_vector( wilson_vector *src1, wilson_vector *src2, + wilson_vector *dest ); +radix magsq_wvec( wilson_vector *src ); +complex wvec_dot( wilson_vector *src1, wilson_vector *src2 ); +complex wvec2_dot( wilson_vector *src1, wilson_vector *src2 ); +radix wvec_rdot( wilson_vector *a, wilson_vector *b ); + +void wp_shrink( wilson_vector *src, half_wilson_vector *dest, + int dir, int sign ); +void wp_shrink_4dir( wilson_vector *a, half_wilson_vector *b1, + half_wilson_vector *b2, half_wilson_vector *b3, + half_wilson_vector *b4, int sign ); +void wp_grow( half_wilson_vector *src, wilson_vector *dest, + int dir, int sign ); +void wp_grow_add( half_wilson_vector *src, wilson_vector *dest, + int dir, int sign ); +void grow_add_four_wvecs( wilson_vector *a, half_wilson_vector *b1, + half_wilson_vector *b2, half_wilson_vector *b3, + half_wilson_vector *b4, int sign, int sum ); +void mult_by_gamma( wilson_vector *src, wilson_vector *dest, int dir ); +void mult_by_gamma_left( wilson_matrix *src, wilson_matrix *dest, int dir ); +void mult_by_gamma_right( wilson_matrix *src, wilson_matrix *dest, int dir ); + +void su3_projector_w( wilson_vector *a, wilson_vector *b, su3_matrix *c ); +void clear_wvec( wilson_vector *dest ); +void copy_wvec( wilson_vector *src, wilson_vector *dest ); +void dump_wilson_vec( wilson_vector *src ); + +radix gaussian_rand_no( void *prn_pt ); + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_adjoint.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_adjoint.c new file mode 100644 index 0000000000000000000000000000000000000000..c66a07729300be479fa19769734bb3e617bf54c0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_adjoint.c @@ -0,0 +1,15 @@ +/****************** su3_adjoint.c (in su3.a) ************************** +* * +* void su3_adjoint( su3_matrix *a, su3_matrix *b ) * +* B <- A_adjoint, adjoint of an SU3 matrix * +*/ +#include "complex.h" +#include "su3.h" + +/* adjoint of an SU3 matrix */ +void su3_adjoint( su3_matrix *a, su3_matrix *b ){ +register int i,j; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + CONJG( a->e[j][i], b->e[i][j] ); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_dot.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_dot.c new file mode 100644 index 0000000000000000000000000000000000000000..7e95ade7774072a323c09a09f530dfc478f15f99 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_dot.c @@ -0,0 +1,45 @@ +/****************** su3_dot.c (in su3.a) ****************************** +* * +* complex su3_dot( su3_vector *a, su3_vector *b ) * +* return dot product of two su3_vectors * +*/ +#include "complex.h" +#include "su3.h" + +complex su3_dot( su3_vector *a, su3_vector *b ){ + +#ifndef NATIVEDOUBLE +complex temp1,temp2; + CMULJ_(a->c[0],b->c[0],temp1) + CMULJ_(a->c[1],b->c[1],temp2) + CSUM(temp1,temp2); + CMULJ_(a->c[2],b->c[2],temp2) + CSUM(temp1,temp2); + return(temp1); + +#else /* RS6000 version */ + + register double ar,ai,br,bi,cr,ci; + register complex cc; + + ar=a->c[0].real; ai=a->c[0].imag; + br=b->c[0].real; bi=b->c[0].imag; + cr = ar*br + ai*bi; + ci = ar*bi - ai*br; + + ar=a->c[1].real; ai=a->c[1].imag; + br=b->c[1].real; bi=b->c[1].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + + ar=a->c[2].real; ai=a->c[2].imag; + br=b->c[2].real; bi=b->c[2].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + + cc.real = cr; + cc.imag = ci; + return(cc); + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_proj.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_proj.c new file mode 100644 index 0000000000000000000000000000000000000000..31f2aba29ed53f10685dc9a80bcea381bba9ced6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_proj.c @@ -0,0 +1,50 @@ +/***************** su3_proj.c (in su3.a) ****************************** +* * +* void su3_projector( su3_vector *a, su3_vector *b, su3_matrix *c ) * +* C <- outer product of A and B * +* C_ij = A_i * B_adjoint_j * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +void su3_projector( su3_vector *a, su3_vector *b, su3_matrix *c ){ +register int i,j; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + CMUL_J( a->c[i], b->c[j], c->e[i][j] ); + } +} + +#else +#ifdef NATIVEDOUBLE /* RS6000 version */ + +void su3_projector( su3_vector *a, su3_vector *b, su3_matrix *c ){ + + register int i,j; + register double ar,ai,br,bi; + + for(i=0;i<3;i++){ + ar=a->c[i].real; ai=a->c[i].imag; + for(j=0;j<3;j++){ + br=b->c[j].real; bi=b->c[j].imag; + c->e[i][j].real = ar*br + ai*bi; + c->e[i][j].imag = ai*br - ar*bi; + } + } +} +#else + +void su3_projector( su3_vector *a, su3_vector *b, su3_matrix *c ){ +register int i,j; +register radix tmp,tmp2; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + tmp2 = a->c[i].real * b->c[j].real; + tmp = a->c[i].imag * b->c[j].imag; + c->e[i][j].real = tmp + tmp2; + tmp2 = a->c[i].real * b->c[j].imag; + tmp = a->c[i].imag * b->c[j].real; + c->e[i][j].imag = tmp - tmp2; + } +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* end ifdef FAST */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_proj_w.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_proj_w.c new file mode 100644 index 0000000000000000000000000000000000000000..3a4bb0f5b6c13c2db3ac5840e0b10269d6af36fc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_proj_w.c @@ -0,0 +1,72 @@ +/***************** su3_projector_w.c (in su3.a) ****************************** +* * +* void su3_projector_w( wilson_vector *a, wilson_vector *b, su3_matrix *c ) +* C <- sum over spins of outer product of A.d[i] and B.d[i] * +* C_ij = sum( A_i * B_adjoint_j ) * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +void su3_projector_w( wilson_vector *a, wilson_vector *b, su3_matrix *c ){ +register int i,j,k; +register complex cc; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + c->e[i][j] = cmplx(0.0,0.0); + for(k=0;k<4;k++){ + CMUL_J( a->d[k].c[i], b->d[k].c[j], cc ); CSUM( c->e[i][j], cc ); + } + } +} + +#else +#ifdef NATIVEDOUBLE /* RS6000 version */ + +void su3_projector_w( wilson_vector *a, wilson_vector *b, su3_matrix *c ){ + register int i,j; + register double ar,ai,br,bi,cr,ci; + + for(i=0;i<3;i++)for(j=0;j<3;j++){ + ar=a->d[0].c[i].real; ai=a->d[0].c[i].imag; + br=b->d[0].c[j].real; bi=b->d[0].c[j].imag; + cr = ar*br + ai*bi; + ci = ai*br - ar*bi; + + ar=a->d[1].c[i].real; ai=a->d[1].c[i].imag; + br=b->d[1].c[j].real; bi=b->d[1].c[j].imag; + cr += ar*br + ai*bi; + ci += ai*br - ar*bi; + + ar=a->d[2].c[i].real; ai=a->d[2].c[i].imag; + br=b->d[2].c[j].real; bi=b->d[2].c[j].imag; + cr += ar*br + ai*bi; + ci += ai*br - ar*bi; + + ar=a->d[3].c[i].real; ai=a->d[3].c[i].imag; + br=b->d[3].c[j].real; bi=b->d[3].c[j].imag; + cr += ar*br + ai*bi; + ci += ai*br - ar*bi; + + c->e[i][j].real = cr; + c->e[i][j].imag = ci; + } +} +#else +void su3_projector_w( wilson_vector *a, wilson_vector *b, su3_matrix *c ){ +register int i,j,k; +register radix tmp_r,tmp_i,tmp2; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + tmp_r = tmp_i = 0.0; + for(k=0;k<4;k++){ + tmp2 = a->d[k].c[i].real * b->d[k].c[j].real; tmp_r = tmp_r + tmp2; + tmp2 = a->d[k].c[i].imag * b->d[k].c[j].imag; tmp_r = tmp_r + tmp2; + tmp2 = a->d[k].c[i].imag * b->d[k].c[j].real; tmp_i = tmp_i + tmp2; + tmp2 = a->d[k].c[i].real * b->d[k].c[j].imag; tmp_i = tmp_i - tmp2; + } + + c->e[i][j].real = tmp_r; + c->e[i][j].imag = tmp_i; + } +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* end ifdef FAST */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_rdot.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_rdot.c new file mode 100644 index 0000000000000000000000000000000000000000..cdacd825d94456113daf3fbeb341e0b5a9120cd9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_rdot.c @@ -0,0 +1,40 @@ +/***************** su3_rdot.c (in su3.a) ****************************** +* * +* radix su3_rdot( su3_vector *a, su3_vector *b ) * +* return real part of dot product of two su3_vectors * +*/ +#include "complex.h" +#include "su3.h" + +radix su3_rdot( su3_vector *a, su3_vector *b ){ + +#ifndef NATIVEDOUBLE +register radix temp1,temp2; + temp2 = a->c[0].real * b->c[0].real; + temp1 = a->c[0].imag * b->c[0].imag; temp2 += temp1; + temp1 = a->c[1].real * b->c[1].real; temp2 += temp1; + temp1 = a->c[1].imag * b->c[1].imag; temp2 += temp1; + temp1 = a->c[2].real * b->c[2].real; temp2 += temp1; + temp1 = a->c[2].imag * b->c[2].imag; temp2 += temp1; + return(temp2); + +#else /* RS6000 version */ + + register double ar,ai,br,bi,ss; + + ar=a->c[0].real; ai=a->c[0].imag; + br=b->c[0].real; bi=b->c[0].imag; + ss = ar*br + ai*bi; + + ar=a->c[1].real; ai=a->c[1].imag; + br=b->c[1].real; bi=b->c[1].imag; + ss += ar*br + ai*bi; + + ar=a->c[2].real; ai=a->c[2].imag; + br=b->c[2].real; bi=b->c[2].imag; + ss += ar*br + ai*bi; + + return(ss); + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3mat_copy.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3mat_copy.c new file mode 100644 index 0000000000000000000000000000000000000000..fb4c14c62175afd17403d9a0307f5a6699bc753d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3mat_copy.c @@ -0,0 +1,16 @@ +/***************** su3mat_copy.c (in su3.a) *************************** +* * +* void su3mat_copy( su3_matrix *a, su3_matrix *b ) * +* Copy an su3 matrix: B <- A * +*/ +#include "complex.h" +#include "su3.h" + +/* Copy a su3 matrix: b <- a */ +void su3mat_copy( su3_matrix *a, su3_matrix *b ){ +register int i,j; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + b->e[i][j].real = a->e[i][j].real; + b->e[i][j].imag = a->e[i][j].imag; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3vec_copy.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3vec_copy.c new file mode 100644 index 0000000000000000000000000000000000000000..86d08e340387b7b058377ce78096f5fbe4f672b9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3vec_copy.c @@ -0,0 +1,16 @@ +/***************** su3vec_copy.c (in su3.a) *************************** +* * +* void su3vec_copy( su3_vector *a, su3_vector *b ) * +* Copy an su3 vector: B <- A * +*/ +#include "complex.h" +#include "su3.h" + +/* Copy a su3 vector: b <- a */ +void su3vec_copy( su3_vector *a, su3_vector *b ){ +register int i; + for(i=0;i<3;i++){ + b->c[i].real = a->c[i].real; + b->c[i].imag = a->c[i].imag; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/sub4vecs.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/sub4vecs.c new file mode 100644 index 0000000000000000000000000000000000000000..121c42657d654967d2c454f1de1f075ba51c584f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/sub4vecs.c @@ -0,0 +1,38 @@ +/***************** sub4vecs.c (in su3.a) ****************************** +* * +* Subtract four su3_vectors from an su3_vector * +* void sub_four_su3_vecs( su3_vector *a,*b1,*b2,*b3,*b4) * +* A <- A - B1 - B2 - B3 - B4 * +*/ +#include "complex.h" +#include "su3.h" + +/* subtract four su3 vectors */ +#ifndef FAST +void sub_four_su3_vecs( su3_vector *a, su3_vector *b1, su3_vector *b2, + su3_vector *b3, su3_vector *b4 ){ +register int i; + for(i=0;i<3;i++){ + CSUB( a->c[i], b1->c[i], a->c[i] ); + CSUB( a->c[i], b2->c[i], a->c[i] ); + CSUB( a->c[i], b3->c[i], a->c[i] ); + CSUB( a->c[i], b4->c[i], a->c[i] ); + } +} +#else +void sub_four_su3_vecs( su3_vector *a, su3_vector *b1, su3_vector *b2, + su3_vector *b3, su3_vector *b4 ){ + CSUB( a->c[0], b1->c[0], a->c[0] ); + CSUB( a->c[1], b1->c[1], a->c[1] ); + CSUB( a->c[2], b1->c[2], a->c[2] ); + CSUB( a->c[0], b2->c[0], a->c[0] ); + CSUB( a->c[1], b2->c[1], a->c[1] ); + CSUB( a->c[2], b2->c[2], a->c[2] ); + CSUB( a->c[0], b3->c[0], a->c[0] ); + CSUB( a->c[1], b3->c[1], a->c[1] ); + CSUB( a->c[2], b3->c[2], a->c[2] ); + CSUB( a->c[0], b4->c[0], a->c[0] ); + CSUB( a->c[1], b4->c[1], a->c[1] ); + CSUB( a->c[2], b4->c[2], a->c[2] ); +} +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/sub_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/sub_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..1ecd3e4289a1c986c219a6f1c9e476876fe2d151 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/sub_wvec.c @@ -0,0 +1,14 @@ +/******************** sub_wvec.c (in su3.a) ******************** +* +*void sub_wilson_vector(wilson_vector *src1,*src2,*dest) +* sub two Wilson vectors +* dest <- src1 + src2 +*/ +#include "complex.h" +#include "su3.h" + +void sub_wilson_vector( wilson_vector *src1, wilson_vector *src2, + wilson_vector *dest ){ + register int i; + for(i=0;i<4;i++)sub_su3_vector( &(src1->d[i]), &(src2->d[i]), &(dest->d[i])); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/submat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/submat.c new file mode 100644 index 0000000000000000000000000000000000000000..94c52b4e2d32fcd5246bfb5a6c7a0ffdcaaf6fa7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/submat.c @@ -0,0 +1,15 @@ +/******************* submat.c (in su3.a) ****************************** +* * +* void sub_su3_matrix(a,b,c) su3_matrix *a,*b,*c; * +* subtract su3 matrices: C <- A - B * +*/ +#include "complex.h" +#include "su3.h" + +/* subtract su3 matrices */ +void sub_su3_matrix( su3_matrix *a, su3_matrix *b, su3_matrix *c ) { +register int i,j; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + CSUB( a->e[i][j], b->e[i][j], c->e[i][j] ); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/subvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/subvec.c new file mode 100644 index 0000000000000000000000000000000000000000..ef5456bba2fc211455307abe142c160c27d2d54c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/subvec.c @@ -0,0 +1,15 @@ +/********************* subvec.c (in su3.a) **************************** +* * +* void sub_su3_vector(a,b,c) su3_vector *a,*b,*c; * +* subtract su3 vectors: C <- A - B * +*/ +#include "complex.h" +#include "su3.h" + +/* subtract su3 vectors */ +void sub_su3_vector( su3_vector *a, su3_vector *b, su3_vector *c ){ +register int i; + for(i=0;i<3;i++){ + CSUB( a->c[i], b->c[i], c->c[i] ); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/trace_su3.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/trace_su3.c new file mode 100644 index 0000000000000000000000000000000000000000..e655a5d09a2976949b8472cbc91f3c1b7bb6d102 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/trace_su3.c @@ -0,0 +1,15 @@ +/******************* trace_su3.c (in su3.a) *************************** +* * +* complex trace_su3(a) su3_matrix *a; * +* return complex trace of an SU3 matrix * +*/ +#include "complex.h" +#include "su3.h" + +/* Complex trace of an SU3 matrix */ +complex trace_su3( su3_matrix *a ) { +register complex t1,t2; + CADD(a->e[0][0],a->e[1][1],t1); + CADD(t1,a->e[2][2],t2); + return(t2); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/uncmp_ahmat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/uncmp_ahmat.c new file mode 100644 index 0000000000000000000000000000000000000000..1205508fc4836cc5ba93592b87e6465fd31c4acf --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/uncmp_ahmat.c @@ -0,0 +1,35 @@ +/************ uncmp_ahmat.c (in su3.a) ******************************** +* * +* void uncompress_anti_hermitian( anti_hermitmat *mat_antihermit, * +* su3_matrix *mat_su3 ) * +* uncompresses an anti_hermitian matrix to make a 3x3 complex matrix * +*/ +#include "complex.h" +#include "su3.h" + +void uncompress_anti_hermitian( anti_hermitmat *mat_antihermit, + su3_matrix *mat_su3 ) { +/* uncompresses an anti_hermitian su3 matrix */ + radix temp1; + mat_su3->e[0][0].imag=mat_antihermit->m00im; + mat_su3->e[0][0].real=0.; + mat_su3->e[1][1].imag=mat_antihermit->m11im; + mat_su3->e[1][1].real=0.; + mat_su3->e[2][2].imag=mat_antihermit->m22im; + mat_su3->e[2][2].real=0.; + mat_su3->e[0][1].imag=mat_antihermit->m01.imag; + temp1=mat_antihermit->m01.real; + mat_su3->e[0][1].real=temp1; + mat_su3->e[1][0].real= -temp1; + mat_su3->e[1][0].imag=mat_antihermit->m01.imag; + mat_su3->e[0][2].imag=mat_antihermit->m02.imag; + temp1=mat_antihermit->m02.real; + mat_su3->e[0][2].real=temp1; + mat_su3->e[2][0].real= -temp1; + mat_su3->e[2][0].imag=mat_antihermit->m02.imag; + mat_su3->e[1][2].imag=mat_antihermit->m12.imag; + temp1=mat_antihermit->m12.real; + mat_su3->e[1][2].real=temp1; + mat_su3->e[2][1].real= -temp1; + mat_su3->e[2][1].imag=mat_antihermit->m12.imag; +}/*uncompress_anti_hermitian*/ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_grow.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_grow.c new file mode 100644 index 0000000000000000000000000000000000000000..cfe5f1ca3ed6b58c23f46c630fb9474b65b82764 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_grow.c @@ -0,0 +1,159 @@ +/***************** wp_grow.c (in su3.a) **************************/ +/* + Expand the "Wilson projection" of a Wilson fermion vector. + (1 +- gamma_j) is a projection operator, and we are given a + half_wilson_vector which contains the two components of a Wilson + vector projected out. This routine reexpands it to a four component + object. + + usage: wp_grow( half_wilson_vector *src, wilson_vector *dest, + int dir, int sign ); + + If dir is one of XUP,YUP,ZUP or TUP, the projection is + along the eigenvectors with eigenvalue +1, which survive + multiplcation by (1+gamma[dir]). + If dir is one of XDOWN,YDOWN,ZDOWN or TDOWN, the projection is + along the eigenvectors with eigenvalue -1, which survive + multiplication by (1-gamma[OPP_DIR(dir)]). + If sign=MINUS reverse the roles of +1 and -1 - in other words + use -gamma_dir instead of gamma_dir + + Here my eigenvectors are normalized to 2, so for XYZT directions + I won't explicitely multiply by 2. In other words, the matrix of + eigenvectors is sqrt(2) times a unitary matrix, and in reexpanding + the vector I will multiply by the adjoint of this matrix. + + For UP directions, hvec.h[0] and hvec.h[2] contain the projections + along the first and second eigenvectors respectively. + For DOWN directions, hvec.h[0] and hvec.h[2] contain the projections + along the third and fourth eigenvectors respectively. This results + in down directions differing from up directions only in the sign of + the addition. + + Note: wp_shrink( +-dir) followed by wp_grow( +-dir) amounts to multiplication + by 1+-gamma_dir + + gamma(XUP) eigenvectors eigenvalue + 0 0 0 i ( 1, 0, 0,-i) +1 + 0 0 i 0 ( 0, 1,-i, 0) +1 + 0 -i 0 0 ( 0, 1, 0,+i) -1 + -i 0 0 0 ( 1, 0,+i, 0) -1 + + gamma(YUP) eigenvectors eigenvalue + 0 0 0 -1 ( 1, 0, 0,-1) +1 + 0 0 1 0 ( 0, 1, 1, 0) +1 + 0 1 0 0 ( 1, 0, 0, 1) -1 + -1 0 0 0 ( 0, 1,-1, 0) -1 + + gamma(ZUP) eigenvectors eigenvalue + 0 0 i 0 ( 1, 0,-i, 0) +1 + 0 0 0 -i ( 0, 1, 0,+i) +1 + -i 0 0 0 ( 1, 0,+i, 0) -1 + 0 i 0 0 ( 0, 1, 0,-i) -1 + + gamma(TUP) eigenvectors eigenvalue + 0 0 1 0 ( 1, 0, 1, 0) +1 + 0 0 0 1 ( 0, 1, 0, 1) +1 + 1 0 0 0 ( 1, 0,-1, 0) -1 + 0 1 0 0 ( 0, 1, 0,-1) -1 + + gamma(FIVE) eigenvectors eigenvalue + 1 0 0 0 + 0 1 0 0 + 0 0 -1 0 + 0 0 0 -1 +*/ +#include +#include "complex.h" +#include "su3.h" +/* Directions, and a macro to give the opposite direction */ +/* These must go from 0 to 7 because they will be used to index an + array. */ +/* Also define NDIRS = number of directions */ +#define XUP 0 +#define YUP 1 +#define ZUP 2 +#define TUP 3 +#define TDOWN 4 +#define ZDOWN 5 +#define YDOWN 6 +#define XDOWN 7 + +#define OPP_DIR(dir) (7-(dir)) /* Opposite direction */ +#define NDIRS 8 /* number of directions */ + +void wp_grow( half_wilson_vector *src, wilson_vector *dest, + int dir, int sign ){ + register int i; /*color*/ + + if(sign==MINUS)dir=OPP_DIR(dir); /* two ways to get -gamma_dir ! */ + switch(dir){ + case XUP: + for(i=0;i<3;i++){ + dest->d[0].c[i] = src->h[0].c[i]; + dest->d[1].c[i] = src->h[1].c[i]; + TIMESMINUSI( src->h[0].c[i], dest->d[3].c[i]); + TIMESMINUSI( src->h[1].c[i], dest->d[2].c[i]); + } + break; + case XDOWN: + for(i=0;i<3;i++){ + dest->d[0].c[i] = src->h[0].c[i]; + dest->d[1].c[i] = src->h[1].c[i]; + TIMESPLUSI( src->h[0].c[i], dest->d[3].c[i]); + TIMESPLUSI( src->h[1].c[i], dest->d[2].c[i]); + } + break; + case YUP: + for(i=0;i<3;i++){ + dest->d[0].c[i] = src->h[0].c[i]; + dest->d[1].c[i] = src->h[1].c[i]; + TIMESMINUSONE( src->h[0].c[i], dest->d[3].c[i] ); + TIMESPLUSONE( src->h[1].c[i], dest->d[2].c[i] ); + } + break; + case YDOWN: + for(i=0;i<3;i++){ + dest->d[0].c[i] = src->h[0].c[i]; + dest->d[1].c[i] = src->h[1].c[i]; + TIMESPLUSONE( src->h[0].c[i], dest->d[3].c[i] ); + TIMESMINUSONE( src->h[1].c[i], dest->d[2].c[i] ); + } + break; + case ZUP: + for(i=0;i<3;i++){ + dest->d[0].c[i] = src->h[0].c[i]; + dest->d[1].c[i] = src->h[1].c[i]; + TIMESMINUSI( src->h[0].c[i], dest->d[2].c[i] ); + TIMESPLUSI( src->h[1].c[i], dest->d[3].c[i] ); + } + break; + case ZDOWN: + for(i=0;i<3;i++){ + dest->d[0].c[i] = src->h[0].c[i]; + dest->d[1].c[i] = src->h[1].c[i]; + TIMESPLUSI( src->h[0].c[i], dest->d[2].c[i] ); + TIMESMINUSI( src->h[1].c[i], dest->d[3].c[i] ); + } + break; + case TUP: + for(i=0;i<3;i++){ + dest->d[0].c[i] = src->h[0].c[i]; + dest->d[1].c[i] = src->h[1].c[i]; + dest->d[2].c[i] = src->h[0].c[i]; + dest->d[3].c[i] = src->h[1].c[i]; + } + break; + case TDOWN: + for(i=0;i<3;i++){ + dest->d[0].c[i] = src->h[0].c[i]; + dest->d[1].c[i] = src->h[1].c[i]; + TIMESMINUSONE( src->h[0].c[i], dest->d[2].c[i] ); + TIMESMINUSONE( src->h[1].c[i], dest->d[3].c[i] ); + } + break; + default: + printf("BAD CALL TO WP_GROW()\n"); + } +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_grow_a.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_grow_a.c new file mode 100644 index 0000000000000000000000000000000000000000..bec3d9e5dee0e2c161b31d0ba1e737703a9708c1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_grow_a.c @@ -0,0 +1,162 @@ +/***************** wp_grow_a.c (in su3.a) **************************/ +/* + Expand the "Wilson projection" of a Wilson fermion vector. + (1 +- gamma_j) is a projection operator, and we are given a + half_wilson_vector which contains the two components of a Wilson + vector projected out. This routine reexpands it to a four component + object and adds it to another Wilson vector. + + usage: wp_grow_add( half_wilson_vector *src, wilson_vector *dest, + int dir, int sign ); + + If dir is one of XUP,YUP,ZUP or TUP, the projection is + along the eigenvectors with eigenvalue +1, which survive + multiplcation by (1+gamma[dir]). + If dir is one of XDOWN,YDOWN,ZDOWN or TDOWN, the projection is + along the eigenvectors with eigenvalue -1, which survive + multiplication by (1-gamma[OPP_DIR(dir)]). + If sign=MINUS reverse the roles of +1 and -1 - in other words + use -gamma_dir instead of gamma_dir + + Here my eigenvectors are normalized to 2, so for XYZT directions + I won't explicitely multiply by 2. In other words, the matrix of + eigenvectors is sqrt(2) times a unitary matrix, and in reexpanding + the vector I will multiply by the adjoint of this matrix. + + For UP directions, hvec.h[0] and hvec.h[2] contain the projections + along the first and second eigenvectors respectively. + For DOWN directions, hvec.h[0] and hvec.h[2] contain the projections + along the third and fourth eigenvectors respectively. This results + in down directions differing from up directions only in the sign of + the addition. + + Note: wp_shrink( +-dir) followed by wp_grow( +-dir) amounts to multiplication + by 1+-gamma_dir + + gamma(XUP) eigenvectors eigenvalue + 0 0 0 i ( 1, 0, 0,-i) +1 + 0 0 i 0 ( 0, 1,-i, 0) +1 + 0 -i 0 0 ( 0, 1, 0,+i) -1 + -i 0 0 0 ( 1, 0,+i ,0) -1 + + gamma(YUP) eigenvectors eigenvalue + 0 0 0 -1 ( 1, 0, 0,-1) +1 + 0 0 1 0 ( 0, 1, 1, 0) +1 + 0 1 0 0 ( 1, 0, 0, 1) -1 + -1 0 0 0 ( 0, 1,-1, 0) -1 + + gamma(ZUP) eigenvectors eigenvalue + 0 0 i 0 ( 1, 0,-i, 0) +1 + 0 0 0 -i ( 0, 1, 0,+i) +1 + -i 0 0 0 ( 1, 0,+i, 0) -1 + 0 i 0 0 ( 0, 1, 0,-i) -1 + + gamma(TUP) eigenvectors eigenvalue + 0 0 1 0 ( 1, 0, 1, 0) +1 + 0 0 0 1 ( 0, 1, 0, 1) +1 + 1 0 0 0 ( 1, 0,-1, 0) -1 + 0 1 0 0 ( 0, 1, 0,-1) -1 + + gamma(FIVE) eigenvectors eigenvalue + 1 0 0 0 + 0 1 0 0 + 0 0 -1 0 + 0 0 0 -1 +*/ +#include +#include "complex.h" +#include "su3.h" +/* Directions, and a macro to give the opposite direction */ +/* These must go from 0 to 7 because they will be used to index an + array. */ +/* Also define NDIRS = number of directions */ +#define XUP 0 +#define YUP 1 +#define ZUP 2 +#define TUP 3 +#define TDOWN 4 +#define ZDOWN 5 +#define YDOWN 6 +#define XDOWN 7 + +#define OPP_DIR(dir) (7-(dir)) /* Opposite direction */ +#define NDIRS 8 /* number of directions */ + +/* a += i*b, a += -i*b */ +#define CSUM_TPI(a,b) { (a).real -= (b).imag; (a).imag += (b).real; } +#define CSUM_TMI(a,b) { (a).real += (b).imag; (a).imag -= (b).real; } + +void wp_grow_add( half_wilson_vector *src, wilson_vector *dest, + int dir, int sign ){ + register int i; /*color*/ + + if(sign==MINUS)dir=OPP_DIR(dir); /* two ways to get -gamma_dir ! */ + switch(dir){ + case XUP: + for(i=0;i<3;i++){ + CSUM( dest->d[0].c[i], src->h[0].c[i]); + CSUM( dest->d[1].c[i], src->h[1].c[i]); + CSUM_TMI( dest->d[2].c[i], src->h[1].c[i] ); + CSUM_TMI( dest->d[3].c[i], src->h[0].c[i] ); + } + break; + case XDOWN: + for(i=0;i<3;i++){ + CSUM( dest->d[0].c[i], src->h[0].c[i]); + CSUM( dest->d[1].c[i], src->h[1].c[i]); + CSUM_TPI( dest->d[2].c[i], src->h[1].c[i] ); + CSUM_TPI( dest->d[3].c[i], src->h[0].c[i] ); + } + break; + case YUP: + for(i=0;i<3;i++){ + CSUM( dest->d[0].c[i], src->h[0].c[i]); + CSUM( dest->d[1].c[i], src->h[1].c[i]); + CSUM( dest->d[2].c[i], src->h[1].c[i]); + CSUB( dest->d[3].c[i], src->h[0].c[i], dest->d[3].c[i] ); + } + break; + case YDOWN: + for(i=0;i<3;i++){ + CSUM( dest->d[0].c[i], src->h[0].c[i]); + CSUM( dest->d[1].c[i], src->h[1].c[i]); + CSUB( dest->d[2].c[i], src->h[1].c[i], dest->d[2].c[i] ); + CSUM( dest->d[3].c[i], src->h[0].c[i]); + } + break; + case ZUP: + for(i=0;i<3;i++){ + CSUM( dest->d[0].c[i], src->h[0].c[i]); + CSUM( dest->d[1].c[i], src->h[1].c[i]); + CSUM_TMI( dest->d[2].c[i], src->h[0].c[i] ); + CSUM_TPI( dest->d[3].c[i], src->h[1].c[i] ); + } + break; + case ZDOWN: + for(i=0;i<3;i++){ + CSUM( dest->d[0].c[i], src->h[0].c[i]); + CSUM( dest->d[1].c[i], src->h[1].c[i]); + CSUM_TPI( dest->d[2].c[i], src->h[0].c[i] ); + CSUM_TMI( dest->d[3].c[i], src->h[1].c[i] ); + } + break; + case TUP: + for(i=0;i<3;i++){ + CSUM( dest->d[0].c[i], src->h[0].c[i]); + CSUM( dest->d[1].c[i], src->h[1].c[i]); + CSUM( dest->d[2].c[i], src->h[0].c[i]); + CSUM( dest->d[3].c[i], src->h[1].c[i]); + } + break; + case TDOWN: + for(i=0;i<3;i++){ + CSUM( dest->d[0].c[i], src->h[0].c[i]); + CSUM( dest->d[1].c[i], src->h[1].c[i]); + CSUB( dest->d[2].c[i], src->h[0].c[i], dest->d[2].c[i] ); + CSUB( dest->d[3].c[i], src->h[1].c[i], dest->d[3].c[i] ); + } + break; + default: + printf("BAD CALL TO WP_GROW()\n"); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_shrink.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_shrink.c new file mode 100644 index 0000000000000000000000000000000000000000..3fae32bfbc3174eb266cfe54a732aa227ef937fd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_shrink.c @@ -0,0 +1,159 @@ +/************* wp_shrink.c (in su3.a) **************************/ +/* + Compute the "Wilson projection" of a Wilson fermion vector. + (1 +- gamma_j) is a projection operator, and we want to isolate + the components of the vector that it keeps. In other words, keep + the components of the vector along the eigenvectors of 1+-gamma_j + with eigenvalue 2, and throw away those with eigenvalue 0. + + usage: wp_shrink( wilson_vector *src, half_wilson_vector *dest, + int dir, int sign ) + + If dir is one of XUP,YUP,ZUP or TUP, take the projections + along the eigenvectors with eigenvalue +1, which survive + multiplication by (1+gamma[dir]). + If dir is one of XDOWN,YDOWN,ZDOWN or TDOWN, take the projections + along the eigenvectors with eigenvalue -1, which survive + multiplication by (1-gamma[OPP_DIR(dir)]). + If sign=MINUS, switch the roles of +1 and -1 (ie use -gamma_dir + instead of gamma_dir ) + + Here my eigenvectors are normalized to 2, so for XYZT directions + I won't explicitely multiply by 2. In other words, the matrix of + eigenvectors is sqrt(2) times a unitary matrix, and in reexpanding + the vector I will multiply by the adjoint of this matrix. + + For UP directions, hvec.h[0] and hvec.h[2] contain the projections + along the first and second eigenvectors respectively. + For DOWN directions, hvec.h[0] and hvec.h[2] contain the projections + along the third and fourth eigenvectors respectively. This results + in down directions differing from up directions only in the sign of + the addition. + + Note: wp_shrink( +-dir) followed by wp_grow( +-dir) amounts to multiplication + by 1+-gamma_dir + + gamma(XUP) eigenvectors eigenvalue + 0 0 0 i ( 1, 0, 0,-i) +1 + 0 0 i 0 ( 0, 1,-i, 0) +1 + 0 -i 0 0 ( 0, 1, 0,+i) -1 + -i 0 0 0 ( 1, 0,+i, 0) -1 + + gamma(YUP) eigenvectors eigenvalue + 0 0 0 -1 ( 1, 0, 0,-1) +1 + 0 0 1 0 ( 0, 1, 1, 0) +1 + 0 1 0 0 ( 1, 0, 0, 1) -1 + -1 0 0 0 ( 0, 1,-1, 0) -1 + + gamma(ZUP) eigenvectors eigenvalue + 0 0 i 0 ( 1, 0,-i, 0) +1 + 0 0 0 -i ( 0, 1, 0,+i) +1 + -i 0 0 0 ( 1, 0,+i, 0) -1 + 0 i 0 0 ( 0, 1, 0,-i) -1 + + gamma(TUP) eigenvectors eigenvalue + 0 0 1 0 ( 1, 0, 1, 0) +1 + 0 0 0 1 ( 0, 1, 0, 1) +1 + 1 0 0 0 ( 1, 0,-1, 0) -1 + 0 1 0 0 ( 0, 1, 0,-1) -1 + + gamma(FIVE) eigenvectors eigenvalue + 1 0 0 0 + 0 1 0 0 + 0 0 -1 0 + 0 0 0 -1 +*/ +#include +#include "complex.h" +#include "su3.h" +/* Directions, and a macro to give the opposite direction */ +/* These must go from 0 to 7 because they will be used to index an + array. */ +/* Also define NDIRS = number of directions */ +#define XUP 0 +#define YUP 1 +#define ZUP 2 +#define TUP 3 +#define TDOWN 4 +#define ZDOWN 5 +#define YDOWN 6 +#define XDOWN 7 + +#define OPP_DIR(dir) (7-(dir)) /* Opposite direction */ +#define NDIRS 8 /* number of directions */ + +void wp_shrink( wilson_vector *src, half_wilson_vector *dest, + int dir, int sign ){ + register int i; /*color*/ + + if(sign==MINUS)dir=OPP_DIR(dir); /* two ways to get -gamma_dir ! */ + switch(dir){ + case XUP: + for(i=0;i<3;i++){ + dest->h[0].c[i].real = src->d[0].c[i].real - src->d[3].c[i].imag; + dest->h[0].c[i].imag = src->d[0].c[i].imag + src->d[3].c[i].real; + dest->h[1].c[i].real = src->d[1].c[i].real - src->d[2].c[i].imag; + dest->h[1].c[i].imag = src->d[1].c[i].imag + src->d[2].c[i].real; + } + break; + case XDOWN: + for(i=0;i<3;i++){ + dest->h[0].c[i].real = src->d[0].c[i].real + src->d[3].c[i].imag; + dest->h[0].c[i].imag = src->d[0].c[i].imag - src->d[3].c[i].real; + dest->h[1].c[i].real = src->d[1].c[i].real + src->d[2].c[i].imag; + dest->h[1].c[i].imag = src->d[1].c[i].imag - src->d[2].c[i].real; + } + break; + case YUP: + for(i=0;i<3;i++){ + dest->h[0].c[i].real = src->d[0].c[i].real - src->d[3].c[i].real; + dest->h[0].c[i].imag = src->d[0].c[i].imag - src->d[3].c[i].imag; + dest->h[1].c[i].real = src->d[1].c[i].real + src->d[2].c[i].real; + dest->h[1].c[i].imag = src->d[1].c[i].imag + src->d[2].c[i].imag; + } + break; + case YDOWN: + for(i=0;i<3;i++){ + dest->h[0].c[i].real = src->d[0].c[i].real + src->d[3].c[i].real; + dest->h[0].c[i].imag = src->d[0].c[i].imag + src->d[3].c[i].imag; + dest->h[1].c[i].real = src->d[1].c[i].real - src->d[2].c[i].real; + dest->h[1].c[i].imag = src->d[1].c[i].imag - src->d[2].c[i].imag; + } + break; + case ZUP: + for(i=0;i<3;i++){ + dest->h[0].c[i].real = src->d[0].c[i].real - src->d[2].c[i].imag; + dest->h[0].c[i].imag = src->d[0].c[i].imag + src->d[2].c[i].real; + dest->h[1].c[i].real = src->d[1].c[i].real + src->d[3].c[i].imag; + dest->h[1].c[i].imag = src->d[1].c[i].imag - src->d[3].c[i].real; + } + break; + case ZDOWN: + for(i=0;i<3;i++){ + dest->h[0].c[i].real = src->d[0].c[i].real + src->d[2].c[i].imag; + dest->h[0].c[i].imag = src->d[0].c[i].imag - src->d[2].c[i].real; + dest->h[1].c[i].real = src->d[1].c[i].real - src->d[3].c[i].imag; + dest->h[1].c[i].imag = src->d[1].c[i].imag + src->d[3].c[i].real; + } + break; + case TUP: + for(i=0;i<3;i++){ + dest->h[0].c[i].real = src->d[0].c[i].real + src->d[2].c[i].real; + dest->h[0].c[i].imag = src->d[0].c[i].imag + src->d[2].c[i].imag; + dest->h[1].c[i].real = src->d[1].c[i].real + src->d[3].c[i].real; + dest->h[1].c[i].imag = src->d[1].c[i].imag + src->d[3].c[i].imag; + } + break; + case TDOWN: + for(i=0;i<3;i++){ + dest->h[0].c[i].real = src->d[0].c[i].real - src->d[2].c[i].real; + dest->h[0].c[i].imag = src->d[0].c[i].imag - src->d[2].c[i].imag; + dest->h[1].c[i].real = src->d[1].c[i].real - src->d[3].c[i].real; + dest->h[1].c[i].imag = src->d[1].c[i].imag - src->d[3].c[i].imag; + } + break; + default: + printf("BAD CALL TO WP_SHRINK()\n"); + } +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_shrink4.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_shrink4.c new file mode 100644 index 0000000000000000000000000000000000000000..b560707cd200198294878934c51a10f7d6198981 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_shrink4.c @@ -0,0 +1,144 @@ +/***************** wp_shrink4.c (in su3.a) **************************** +* * +* Shrink a wilson vector in four directions, producing four * +* half_wilson_vectors. * +* void wp_shrink_4dir( wilson_vector *a, half_wilson_vector *b1, * +* half_wilson_vector *b2, half_wilson_vector *b3, * +* half_wilson_vector *b4, int sign ); * +* B1 <- (1 +- gamma_x)A,, projection * +* argument "sign" is sign of gamma matrix. * +* See wp_shrink.c for definitions of gamma matrices and eigenvectors. * +*/ +#include "complex.h" +#include "su3.h" +/* Directions, and a macro to give the opposite direction */ +/* These must go from 0 to 7 because they will be used to index an + array. */ +/* Also define NDIRS = number of directions */ +#define XUP 0 +#define YUP 1 +#define ZUP 2 +#define TUP 3 +#define TDOWN 4 +#define ZDOWN 5 +#define YDOWN 6 +#define XDOWN 7 + +#define OPP_DIR(dir) (7-(dir)) /* Opposite direction */ +#define NDIRS 8 /* number of directions */ + +#ifndef FAST /* "FAST", or IBM RS6000 version inlines calls */ + +void wp_shrink_4dir( wilson_vector *a, half_wilson_vector *b1, + half_wilson_vector *b2, half_wilson_vector *b3, + half_wilson_vector *b4, int sign ){ + wp_shrink( a,b1,XUP,sign); + wp_shrink( a,b2,YUP,sign); + wp_shrink( a,b3,ZUP,sign); + wp_shrink( a,b4,TUP,sign); +} + +#else /* "FAST" code inlines calls */ + +void wp_shrink_4dir( wilson_vector *a, half_wilson_vector *b1, + half_wilson_vector *b2, half_wilson_vector *b3, + half_wilson_vector *b4, int sign ){ + register int i; /*color*/ + +/* wp_shrink( a,b1,XUP,sign); */ + + if(sign==PLUS) + { + /* case XUP: */ + for(i=0;i<3;i++){ + b1->h[0].c[i].real = a->d[0].c[i].real - a->d[3].c[i].imag; + b1->h[0].c[i].imag = a->d[0].c[i].imag + a->d[3].c[i].real; + b1->h[1].c[i].real = a->d[1].c[i].real - a->d[2].c[i].imag; + b1->h[1].c[i].imag = a->d[1].c[i].imag + a->d[2].c[i].real; + } + } + else + { + /* case XDOWN: */ + for(i=0;i<3;i++){ + b1->h[0].c[i].real = a->d[0].c[i].real + a->d[3].c[i].imag; + b1->h[0].c[i].imag = a->d[0].c[i].imag - a->d[3].c[i].real; + b1->h[1].c[i].real = a->d[1].c[i].real + a->d[2].c[i].imag; + b1->h[1].c[i].imag = a->d[1].c[i].imag - a->d[2].c[i].real; + } + } + + + /* wp_shrink( a,b2,YUP,sign); */ + + if(sign==PLUS) + { + /* case YUP: */ + for(i=0;i<3;i++){ + b2->h[0].c[i].real = a->d[0].c[i].real - a->d[3].c[i].real; + b2->h[0].c[i].imag = a->d[0].c[i].imag - a->d[3].c[i].imag; + b2->h[1].c[i].real = a->d[1].c[i].real + a->d[2].c[i].real; + b2->h[1].c[i].imag = a->d[1].c[i].imag + a->d[2].c[i].imag; + } + + } + else + { + /* case YDOWN: */ + for(i=0;i<3;i++){ + b2->h[0].c[i].real = a->d[0].c[i].real + a->d[3].c[i].real; + b2->h[0].c[i].imag = a->d[0].c[i].imag + a->d[3].c[i].imag; + b2->h[1].c[i].real = a->d[1].c[i].real - a->d[2].c[i].real; + b2->h[1].c[i].imag = a->d[1].c[i].imag - a->d[2].c[i].imag; + } + } + + /* wp_shrink( a,b3,ZUP,sign); */ + + if(sign==PLUS) + { + /* case ZUP: */ + for(i=0;i<3;i++){ + b3->h[0].c[i].real = a->d[0].c[i].real - a->d[2].c[i].imag; + b3->h[0].c[i].imag = a->d[0].c[i].imag + a->d[2].c[i].real; + b3->h[1].c[i].real = a->d[1].c[i].real + a->d[3].c[i].imag; + b3->h[1].c[i].imag = a->d[1].c[i].imag - a->d[3].c[i].real; + } + } + else + { + /* case ZDOWN: */ + for(i=0;i<3;i++){ + b3->h[0].c[i].real = a->d[0].c[i].real + a->d[2].c[i].imag; + b3->h[0].c[i].imag = a->d[0].c[i].imag - a->d[2].c[i].real; + b3->h[1].c[i].real = a->d[1].c[i].real - a->d[3].c[i].imag; + b3->h[1].c[i].imag = a->d[1].c[i].imag + a->d[3].c[i].real; + } + + } + +/* wp_shrink( a,b4,TUP,sign); */ + + if(sign==PLUS) + { + /* case TUP: */ + for(i=0;i<3;i++){ + b4->h[0].c[i].real = a->d[0].c[i].real + a->d[2].c[i].real; + b4->h[0].c[i].imag = a->d[0].c[i].imag + a->d[2].c[i].imag; + b4->h[1].c[i].real = a->d[1].c[i].real + a->d[3].c[i].real; + b4->h[1].c[i].imag = a->d[1].c[i].imag + a->d[3].c[i].imag; + } + } + else + { + /* case TDOWN: */ + for(i=0;i<3;i++){ + b4->h[0].c[i].real = a->d[0].c[i].real - a->d[2].c[i].real; + b4->h[0].c[i].imag = a->d[0].c[i].imag - a->d[2].c[i].imag; + b4->h[1].c[i].real = a->d[1].c[i].real - a->d[3].c[i].real; + b4->h[1].c[i].imag = a->d[1].c[i].imag - a->d[3].c[i].imag; + } + } +} + +#endif /* "ifndef FAST */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_shrink8.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_shrink8.c new file mode 100644 index 0000000000000000000000000000000000000000..f7816c4beec9257894ece10d72a4d3147231aa86 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_shrink8.c @@ -0,0 +1,39 @@ +/***************** wp_shrink8.c (in su3.a) **************************** +* * +* Shrink a wilson vector in eight directions, producing eight * +* half_wilson_vectors. * +* void wp_shrink_8dir(a,b,sign) * +* wilson_vector *a; half_wilson_vector *b; * +* int sign; * +* B1 <- (1 +- gamma_x)A,, projection * +* argument "sign" is sign of gamma matrix. * +* See wp_shrink.c for definitions of gamma matrices and eigenvectors. * +*/ +#include "complex.h" +#include "su3.h" +/* Directions, and a macro to give the opposite direction */ +/* These must go from 0 to 7 because they will be used to index an + array. */ +/* Also define NDIRS = number of directions */ +#define XUP 0 +#define YUP 1 +#define ZUP 2 +#define TUP 3 +#define TDOWN 4 +#define ZDOWN 5 +#define YDOWN 6 +#define XDOWN 7 + +#define OPP_DIR(dir) (7-(dir)) /* Opposite direction */ +#define NDIRS 8 /* number of directions */ + +void wp_shrink_8dir( wilson_vector *a, half_wilson_vector *b, int sign) { + wp_shrink( a,&(b[XUP]),XUP,sign); + wp_shrink( a,&(b[YUP]),YUP,sign); + wp_shrink( a,&(b[ZUP]),ZUP,sign); + wp_shrink( a,&(b[TUP]),TUP,sign); + wp_shrink( a,&(b[XDOWN]),XDOWN,sign); + wp_shrink( a,&(b[YDOWN]),YDOWN,sign); + wp_shrink( a,&(b[ZDOWN]),ZDOWN,sign); + wp_shrink( a,&(b[TDOWN]),TDOWN,sign); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wvec2_dot.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wvec2_dot.c new file mode 100644 index 0000000000000000000000000000000000000000..65cc34def00ff367f082b1334ec31199173cbdd2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wvec2_dot.c @@ -0,0 +1,26 @@ +/****************** wvec_dot.c (in su3.a) ****************************** +* * +* complex wvec2_dot( wilson_vector *a, wilson_vector *b ) * +* return dot product of two wilson_vectors = a-dagger times b * +*/ +#include "complex.h" +#include "su3.h" + +complex wvec2_dot( wilson_vector *a, wilson_vector *b ){ + complex temp; + wilson_vector c; + register int i,j; + + temp.real = wvec_rdot(a,b); + + for(i=0;i<4;i++){ + for(j=0;j<3;j++){ + c.d[i].c[j].real = -(a->d[i].c[j].imag); + c.d[i].c[j].imag = a->d[i].c[j].real; + } + } + + temp.imag = wvec_rdot(&c,b); + + return(temp); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wvec_dot.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wvec_dot.c new file mode 100644 index 0000000000000000000000000000000000000000..28cd247686da0d35e54471c88abad30f78e3bab3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wvec_dot.c @@ -0,0 +1,84 @@ +/****************** wvec_dot.c (in su3.a) ****************************** +* * +* complex wvec_dot(a,b) wilson_vector *a,*b; * +* return dot product of two wilson_vectors * +*/ +#include "complex.h" +#include "su3.h" + +complex wvec_dot( wilson_vector *a, wilson_vector *b ){ + +#ifndef NATIVEDOUBLE + complex temp1,temp2; + register int i; + temp1.real = temp1.imag = 0.0; + for(i=0;i<4;i++){ + CMULJ_(a->d[i].c[0],b->d[i].c[0],temp2); CSUM(temp1,temp2); + CMULJ_(a->d[i].c[1],b->d[i].c[1],temp2); CSUM(temp1,temp2); + CMULJ_(a->d[i].c[2],b->d[i].c[2],temp2); CSUM(temp1,temp2); + } + return(temp1); + +#else /* RS6000 version */ + + register double ar,ai,br,bi,cr,ci; + register complex cc; + + ar=a->d[0].c[0].real; ai=a->d[0].c[0].imag; + br=b->d[0].c[0].real; bi=b->d[0].c[0].imag; + cr = ar*br + ai*bi; + ci = ar*bi - ai*br; + ar=a->d[0].c[1].real; ai=a->d[0].c[1].imag; + br=b->d[0].c[1].real; bi=b->d[0].c[1].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + ar=a->d[0].c[2].real; ai=a->d[0].c[2].imag; + br=b->d[0].c[2].real; bi=b->d[0].c[2].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + + ar=a->d[1].c[0].real; ai=a->d[1].c[0].imag; + br=b->d[1].c[0].real; bi=b->d[1].c[0].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + ar=a->d[1].c[1].real; ai=a->d[1].c[1].imag; + br=b->d[1].c[1].real; bi=b->d[1].c[1].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + ar=a->d[1].c[2].real; ai=a->d[1].c[2].imag; + br=b->d[1].c[2].real; bi=b->d[1].c[2].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + + ar=a->d[2].c[0].real; ai=a->d[2].c[0].imag; + br=b->d[2].c[0].real; bi=b->d[2].c[0].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + ar=a->d[2].c[1].real; ai=a->d[2].c[1].imag; + br=b->d[2].c[1].real; bi=b->d[2].c[1].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + ar=a->d[2].c[2].real; ai=a->d[2].c[2].imag; + br=b->d[2].c[2].real; bi=b->d[2].c[2].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + + ar=a->d[3].c[0].real; ai=a->d[3].c[0].imag; + br=b->d[3].c[0].real; bi=b->d[3].c[0].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + ar=a->d[3].c[1].real; ai=a->d[3].c[1].imag; + br=b->d[3].c[1].real; bi=b->d[3].c[1].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + ar=a->d[3].c[2].real; ai=a->d[3].c[2].imag; + br=b->d[3].c[2].real; bi=b->d[3].c[2].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + + cc.real = cr; + cc.imag = ci; + return(cc); + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wvec_rdot.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wvec_rdot.c new file mode 100644 index 0000000000000000000000000000000000000000..13b7b744694b95bfa7d7cfef4c5bf92b6287a8c6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wvec_rdot.c @@ -0,0 +1,72 @@ +/***************** wvec_rdot.c (in su3.a) ****************************** +* * +* radix wvec_rdot( wilson_vector *a, wilson_vector *b ) * +* return real part of dot product of two wilson_vectors * +*/ +#include "complex.h" +#include "su3.h" + +radix wvec_rdot( wilson_vector *a, wilson_vector *b ){ + +#ifndef NATIVEDOUBLE + register radix temp1,temp2; + register int i; + temp2=0.0; + for(i=0;i<4;i++){ + temp1 = a->d[i].c[0].real * b->d[i].c[0].real; temp2 += temp1; + temp1 = a->d[i].c[0].imag * b->d[i].c[0].imag; temp2 += temp1; + temp1 = a->d[i].c[1].real * b->d[i].c[1].real; temp2 += temp1; + temp1 = a->d[i].c[1].imag * b->d[i].c[1].imag; temp2 += temp1; + temp1 = a->d[i].c[2].real * b->d[i].c[2].real; temp2 += temp1; + temp1 = a->d[i].c[2].imag * b->d[i].c[2].imag; temp2 += temp1; + } + return(temp2); + +#else /* RS6000 version */ + + register double ar,ai,br,bi,ss; + + ar=a->d[0].c[0].real; ai=a->d[0].c[0].imag; + br=b->d[0].c[0].real; bi=b->d[0].c[0].imag; + ss = ar*br + ai*bi; + ar=a->d[0].c[1].real; ai=a->d[0].c[1].imag; + br=b->d[0].c[1].real; bi=b->d[0].c[1].imag; + ss += ar*br + ai*bi; + ar=a->d[0].c[2].real; ai=a->d[0].c[2].imag; + br=b->d[0].c[2].real; bi=b->d[0].c[2].imag; + ss += ar*br + ai*bi; + + ar=a->d[1].c[0].real; ai=a->d[1].c[0].imag; + br=b->d[1].c[0].real; bi=b->d[1].c[0].imag; + ss += ar*br + ai*bi; + ar=a->d[1].c[1].real; ai=a->d[1].c[1].imag; + br=b->d[1].c[1].real; bi=b->d[1].c[1].imag; + ss += ar*br + ai*bi; + ar=a->d[1].c[2].real; ai=a->d[1].c[2].imag; + br=b->d[1].c[2].real; bi=b->d[1].c[2].imag; + ss += ar*br + ai*bi; + + ar=a->d[2].c[0].real; ai=a->d[2].c[0].imag; + br=b->d[2].c[0].real; bi=b->d[2].c[0].imag; + ss += ar*br + ai*bi; + ar=a->d[2].c[1].real; ai=a->d[2].c[1].imag; + br=b->d[2].c[1].real; bi=b->d[2].c[1].imag; + ss += ar*br + ai*bi; + ar=a->d[2].c[2].real; ai=a->d[2].c[2].imag; + br=b->d[2].c[2].real; bi=b->d[2].c[2].imag; + ss += ar*br + ai*bi; + + ar=a->d[3].c[0].real; ai=a->d[3].c[0].imag; + br=b->d[3].c[0].real; bi=b->d[3].c[0].imag; + ss += ar*br + ai*bi; + ar=a->d[3].c[1].real; ai=a->d[3].c[1].imag; + br=b->d[3].c[1].real; bi=b->d[3].c[1].imag; + ss += ar*br + ai*bi; + ar=a->d[3].c[2].real; ai=a->d[3].c[2].imag; + br=b->d[3].c[2].real; bi=b->d[3].c[2].imag; + ss += ar*br + ai*bi; + + return(ss); + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Goverrelax.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Goverrelax.c new file mode 100644 index 0000000000000000000000000000000000000000..3f0863954bafb64225032c8217592398d5c4c921 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Goverrelax.c @@ -0,0 +1,53 @@ +/********************* HBHiggs.c ************************/ + +#include "lattice.h" + +/* #include "gaussian_ran.c" + */ + +double Goverrelax(int parity, adjoint_matrix *ahiggs, adjoint_matrix *adjstaple) +{ + int i,j,nhit; + double rsum,sm,r2,r2o; + adjoint_matrix m; + + nhit = 0; + rsum = 0.0; + + sm = betaA/beta2; + + forparity(i,parity) { + + prefetch_adjoint(&ahiggs[i+1]); + prefetch_adjoint(&adjstaple[i+1]); + + /* perform a Gaussian overrelax for Higgs: Since + * Act = -bA A.S + b2 A^2 + b4 A^4, we can do + * Act = b2 (A - bA/(2b2) S)^2 + b4 A^4 + * Thus, reflect A using the gaussian potential: + * (A'-bA/2b2 S) = -(A-bA/2b2 S) => + * A' = bA/b2 S - A + * Accept/reject with the change in the A^4-term + */ + + for (r2o=j=0; j<8; j++) r2o += sqr( ahiggs[i].l[j] ); + + for (r2=j=0; j<8; j++) { + m.l[j] = sm * adjstaple[i].l[j] - ahiggs[i].l[j]; + r2 += sqr( m.l[j] ); + } + /* acc/rej with A^4 */ + if ( exp( beta4*(r2o*r2o - r2*r2) ) >= dran() ) { + ahiggs[i] = m; + nhit++; + rsum += r2; + } else rsum += r2o; + } + + nhitog++; + if (parity == EVEN) ahitog += 1.0*nhit/node.evensites; + else ahitog += 1.0*nhit/node.oddsites; + + return(rsum); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/HBHiggs.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/HBHiggs.c new file mode 100644 index 0000000000000000000000000000000000000000..ef7448a22d707e46a394f7c2f850cfbce9d16ca6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/HBHiggs.c @@ -0,0 +1,53 @@ +/********************* HBHiggs.c ************************/ + +#include "lattice.h" + +/* #include "gaussian_ran.c" + */ + +double HBHiggs(int parity, adjoint_matrix *ahiggs, adjoint_matrix *adjstaple) +{ + int i,j,ntry; + double rsum,w,sm,r2,r4o; + + ntry = 0; + rsum = 0.0; + + w = 1.0/sqrt(beta2); + sm = betaA/(2.0*beta2); + + forparity(i,parity) { + + prefetch_adjoint(&ahiggs[i+1]); + prefetch_adjoint(&adjstaple[i+1]); + + /* perform a heat bath update for Higgs: Since + * Act = -bA A.adjStaple + b2 A^2 + b4 A^4, we can do + * Act = b2 (A - bA/(2b2) adjStaple)^2 + b4 A^4 + * Thus, pull A from a gaussian distribution + * A = bA/2b2 S + 1/sqrt(b2) gaussian_ran() + * and acc/rej with the b4-term + */ + + /* for (r4o=j=0; j<8; j++) r4o += sqr( st->ahiggs.l[j] ); + r4o *= r4o; */ + + do { + ++ntry; + for (r2=j=0; j<8; j++) { + ahiggs[i].l[j] = sm * adjstaple[i].l[j] + w * gaussian_ran(); + r2 += sqr( ahiggs[i].l[j] ); + } + /* acc/rej with A^4 */ + } while ( exp( -beta4*r2*r2 ) < dran() ); /* loop until ok */ + + rsum += r2; + } + + nhithb++; + if (parity == EVEN) ahithb += 1.0*node.evensites/ntry; + else ahithb += 1.0*node.oddsites/ntry; + + return(rsum); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Make_sse_vanilla b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Make_sse_vanilla new file mode 100644 index 0000000000000000000000000000000000000000..b52966753067b8538d9ae7c59b08b63ac2909713 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Make_sse_vanilla @@ -0,0 +1,127 @@ +# Makefile for the hybrid molecular dynamics simulation with +# pure gauge SU3 +# MIMD version 3 +# + +#Where the complex and su3 libraries are +#c code +LIBDIR = ../libraries +GENERIC = ../generic +SSEDIR = ../sse +INCLDIR = -I$(GENERIC) -I$(LIBDIR) -I$(SSEDIR) + +HEADERS= $(LIBDIR)/complex.h $(LIBDIR)/su3.h $(GENERIC)/comdefs.h $(GENERIC)/generic.h $(GENERIC)/generic_su3.h lattice.h + +# include file defining site structure, etc., for compiling generic code */ +LATDEF = -DLATDEF='"../su3h_n/lattice.h"' +# The quotation marks are necessary, as is the "../pure_gauge", since +# some compilations will be done in another directory + +# Choose one of the lattice layout algorithms: + +OBJECTS=control.o updategauge.o relax.o monte.o measure.o \ + setup.o setup_basic.o \ + reunitarize.o staples_su3.o \ + layout.o io_lattice.o \ + mersenne_inline.o parameter_io.o timecheck.o \ + random_su3P.o +# check_unitarity.o is not used +# correlation.o + +HOBJECTS=updatehiggs.o Xoverrelax.o HBHiggs.o adjmat_operations.o \ + setcouplings_higgs.o \ + multican.o smooth_field_su3adjoint.o block_field_su3adjoint.o \ + block_lattice.o correlation.o smooth_link_su3.o block_link_su3.o \ + gaussian_ran.o + +#MACHINE_DEP = com_intelsim.o +#MACHINE_DEP = com_intel.o +MACHINE_DEP = com_vanilla.o + +#Library for multinode communication and information functions +#ILIB= /usr/local/lib/bsimlib.a #preon.physics.arizona.edu +#ILIB= -node #Intel machine +ILIB= #vanilla + +#Libraries for complex numbers and su3 functions +QCDLIB = $(LIBDIR)/su3.a $(LIBDIR)/complex.a + +#CFLAGS= -g -f $(INCLDIR) -DPROTO #MIPS +#CFLAGS= -g -fsingle $(INCLDIR) #Sun +CFLAGS= -O4 -DPROTO $(INCLDIR) $(LATDEF) #gnu c compiler +#CFLAGS = -non_shared -O4 -std1 -arch ev6 -DFAST -DPROTO $(INCLDIR) $(LATDEF) -float #Dec alpha compiler +#CFLAGS = -O4 -std1 -arch ev6 -DFAST -DPROTO $(INCLDIR) -float #Dec alpha compiler +#CFLAGS = -O4 -std1 -DFAST -DPROTO $(INCLDIR) -float #Dec alpha compiler +#CFLAGS = -O -DFAST -DPROTO $(INCLDIR) #Dec alpha compiler +#CFLAGS = -g -Wall -DFAST -DPROTO $(INCLDIR) #Dec alpha compiler + +COMPILER = cc #generic, for simulator +#COMPILER = gcc #Intel Green Hills compiler (SDSC only) +#COMPILER = icc #Intel pgcc + +DEFINES = -DCAN_DO_ALLOCA # can do alloca + + +.c.o: ; $(COMPILER) $(CFLAGS) -c $(DEFINES) $*.c + +$(OBJECTS) $(EXTRA_OBJECTS) : $(HEADERS) +su3_ahiggs:: + make -f Make_vanilla target "TARGET= su3_ahiggs" \ + "DEFINES= -DHIGGS -DSSE_INLINE" "EXTRA_OBJECTS= $(HOBJECTS)" + +su3_gauge:: + make -f Make_vanilla target "TARGET= su3_gauge" \ + "DEFINES= " "EXTRA_OBJECTS= setcouplings_gauge.o" + +su3_4d:: + make -f Make_vanilla target "TARGET= su3_4d" \ + "DEFINES= -DDIMENSION=4 -DP4 -DSSE -DSSE_INLINE" \ + "EXTRA_OBJECTS= setcouplings_gauge.o" + + +clean: + rm -f *.o + + +# Choose one of the lattice layout algorithms: +layout.o: ../generic/layout.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/layout.c +reunitarize.o: ../generic/reunitarize.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/reunitarize.c +com_vanilla.o: ../generic/com_vanilla.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/com_vanilla.c +mersenne_inline.o: ../generic/mersenne_inline.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/mersenne_inline.c +setup_basic.o: ../generic/setup_basic.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/setup_basic.c +staples_su3.o: ../generic/staples_su3.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/staples_su3.c +random_su3P.o: ../generic/random_su3P.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/random_su3P.c +parameter_io.o: ../generic/parameter_io.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/parameter_io.c +timecheck.o: ../generic/timecheck.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/timecheck.c + +gaussian_ran.o: ../generic/gaussian_ran.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/gaussian_ran.c + +block_lattice.o: ../generic/block_lattice.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/block_lattice.c +smooth_field_su3adjoint.o: ../generic/smooth_field_su3adjoint.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/smooth_field_su3adjoint.c +smooth_link_su3.o: ../generic/smooth_link_su3.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/smooth_link_su3.c +block_field_su3adjoint.o: ../generic/block_field_su3adjoint.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/block_field_su3adjoint.c +block_link_su3.o: ../generic/block_link_su3.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/block_link_su3.c + + + +target: $(OBJECTS) $(MACHINE_DEP) $(EXTRA_OBJECTS) $(QCDLIB) + $(COMPILER) $(CFLAGS) -o $(TARGET) $(DEFINES) \ + $(OBJECTS) $(MACHINE_DEP) $(EXTRA_OBJECTS) $(QCDLIB) $(ILIB) -lm + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Make_vanilla b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Make_vanilla new file mode 100644 index 0000000000000000000000000000000000000000..0d1866b29edadeb57c434ba4232b28da87c074da --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Make_vanilla @@ -0,0 +1,126 @@ +# Makefile for the hybrid molecular dynamics simulation with +# pure gauge SU3 +# MIMD version 3 +# + +#Where the complex and su3 libraries are +#c code +LIBDIR = ../libraries +GENERIC = ../generic +INCLDIR = -I$(GENERIC) -I$(LIBDIR) + +HEADERS= $(LIBDIR)/complex.h $(LIBDIR)/su3.h $(GENERIC)/comdefs.h $(GENERIC)/generic.h $(GENERIC)/generic_su3.h lattice.h + +# include file defining site structure, etc., for compiling generic code */ +LATDEF = -DLATDEF='"../su3h_n/lattice.h"' +# The quotation marks are necessary, as is the "../pure_gauge", since +# some compilations will be done in another directory + +# Choose one of the lattice layout algorithms: + +OBJECTS=control.o updategauge.o relax.o monte.o measure.o \ + setup.o setup_basic.o \ + reunitarize.o staples_su3.o \ + layout.o io_lattice.o \ + mersenne_inline.o parameter_io.o timecheck.o \ + random_su3P.o gaugefix.o +# check_unitarity.o is not used +# correlation.o + +HOBJECTS=updatehiggs.o Xoverrelax.o Goverrelax.o HBHiggs.o adjmat_operations.o \ + setcouplings_higgs.o \ + multican.o smooth_field_su3adjoint.o block_field_su3adjoint.o \ + block_lattice.o correlation.o smooth_link_su3.o block_link_su3.o \ + gaussian_ran.o + +#MACHINE_DEP = com_intelsim.o +#MACHINE_DEP = com_intel.o +MACHINE_DEP = com_vanilla.o + +#Library for multinode communication and information functions +#ILIB= /usr/local/lib/bsimlib.a #preon.physics.arizona.edu +#ILIB= -node #Intel machine +ILIB= #vanilla + +#Libraries for complex numbers and su3 functions +QCDLIB = $(LIBDIR)/su3.a $(LIBDIR)/complex.a + +#CFLAGS= -g -f $(INCLDIR) -DPROTO #MIPS +#CFLAGS= -g -fsingle $(INCLDIR) #Sun +CFLAGS= -O4 -DPROTO $(INCLDIR) $(LATDEF) #gnu c compiler +#CFLAGS = -non_shared -O4 -std1 -arch ev6 -DFAST -DPROTO $(INCLDIR) $(LATDEF) -float #Dec alpha compiler +#CFLAGS = -O4 -std1 -arch ev6 -DFAST -DPROTO $(INCLDIR) -float #Dec alpha compiler +#CFLAGS = -O4 -std1 -DFAST -DPROTO $(INCLDIR) -float #Dec alpha compiler +#CFLAGS = -O -DFAST -DPROTO $(INCLDIR) #Dec alpha compiler +#CFLAGS = -g -Wall -DFAST -DPROTO $(INCLDIR) #Dec alpha compiler + +COMPILER = cc #generic, for simulator +#COMPILER = gcc #Intel Green Hills compiler (SDSC only) +#COMPILER = icc #Intel pgcc + +DEFINES = -DCAN_DO_ALLOCA # can do alloca + + +.c.o: ; $(COMPILER) $(CFLAGS) -c $(DEFINES) $*.c + +$(OBJECTS) $(EXTRA_OBJECTS) : $(HEADERS) +su3_ahiggs:: + make -f Make_vanilla target "TARGET= su3_ahiggs" \ + "DEFINES= -DHIGGS" "EXTRA_OBJECTS= $(HOBJECTS)" + +su3_gauge:: + make -f Make_vanilla target "TARGET= su3_gauge" \ + "EXTRA_OBJECTS= setcouplings_gauge.o" + +su3_4d:: + make -f Make_vanilla target "TARGET= su3_4d" \ + "DEFINES= -DDIMENSION=4" \ + "EXTRA_OBJECTS= setcouplings_gauge.o ploop.o" + + +clean: + rm -f *.o + + +# Choose one of the lattice layout algorithms: +layout.o: ../generic/layout.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/layout.c +reunitarize.o: ../generic/reunitarize.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/reunitarize.c +com_vanilla.o: ../generic/com_vanilla.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/com_vanilla.c +mersenne_inline.o: ../generic/mersenne_inline.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/mersenne_inline.c +setup_basic.o: ../generic/setup_basic.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/setup_basic.c +staples_su3.o: ../generic/staples_su3.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/staples_su3.c +random_su3P.o: ../generic/random_su3P.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/random_su3P.c +parameter_io.o: ../generic/parameter_io.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/parameter_io.c +timecheck.o: ../generic/timecheck.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/timecheck.c + +gaussian_ran.o: ../generic/gaussian_ran.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/gaussian_ran.c + +block_lattice.o: ../generic/block_lattice.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/block_lattice.c +smooth_field_su3adjoint.o: ../generic/smooth_field_su3adjoint.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/smooth_field_su3adjoint.c +smooth_link_su3.o: ../generic/smooth_link_su3.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/smooth_link_su3.c +block_field_su3adjoint.o: ../generic/block_field_su3adjoint.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/block_field_su3adjoint.c +block_link_su3.o: ../generic/block_link_su3.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/block_link_su3.c + + + +target: $(OBJECTS) $(MACHINE_DEP) $(EXTRA_OBJECTS) $(QCDLIB) + $(COMPILER) $(CFLAGS) -o $(TARGET) $(DEFINES) \ + $(OBJECTS) $(MACHINE_DEP) $(EXTRA_OBJECTS) $(QCDLIB) $(ILIB) -static -lm + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..8fdd943f82f64dd57b4353cf552ca7018450c575 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Makefile @@ -0,0 +1,108 @@ +include ../Makefile.defs + +# MPI C compiler +# MPI_CC = mpcc_r #IBM RS6000 + +#Where the complex and su3 libraries are +LIBDIR = ../libraries +GENERIC = ../generic +INCLDIR = -I$(GENERIC) -I$(LIBDIR) + +HEADERS= $(LIBDIR)/complex.h $(LIBDIR)/su3.h $(GENERIC)/comdefs.h $(GENERIC)/generic.h $(GENERIC)/generic_su3.h lattice.h + +# include file defining site structure, etc., for compiling generic code */ +LATDEF = -DLATDEF='"../su3h_n/lattice.h"' + +OBJECTS=control.o updategauge.o relax.o monte.o measure.o \ + setup.o setup_basic.o timers.o \ + reunitarize.o staples_su3.o \ + layout.o io_lattice.o \ + mersenne_inline.o parameter_io.o timecheck.o \ + random_su3P.o +# check_unitarity.o is not used +# correlation.o + +HOBJECTS=updatehiggs.o Xoverrelax.o Goverrelax.o HBHiggs.o adjmat_operations.o \ + setcouplings_higgs.o \ + multican.o smooth_field_su3adjoint.o block_field_su3adjoint.o \ + block_lattice.o correlation.o smooth_link_su3.o block_link_su3.o \ + gaussian_ran.o + +MACHINE_DEP = com_mpi.o + +#Libraries for complex numbers and su3 functions +QCDLIB = $(LIBDIR)/su3.a $(LIBDIR)/complex.a + +#PABS -DHIGGS added +CFLAGS += -DMPI -DFAST $(INCLDIR) $(LATDEF) -DTIMERS -DHIGGS + +COMPILER = $(MPI_CC) + +.SUFFIXES: +.SUFFIXES: .o .t3e .c .y .l .s + +.c.o: + $(COMPILER) $(CFLAGS) -c $(DEFINES) $*.c +.s.o: + $(COMPILER) $(CFLAGS) -c $(DEFINES) $*.s + +$(OBJECTS) $(EXTRA_OBJECTS) : $(HEADERS) + +su3_ahiggs:: + $(MAKE) target "MYTARGET= su3_ahiggs" \ + "DEFINES= -DHIGGS" "EXTRA_OBJECTS= $(HOBJECTS)" + +su3_gauge:: + $(MAKE) target "MYTARGET= su3_gauge" \ + "EXTRA_OBJECTS= setcouplings_gauge.o" + +su3_4d:: + $(MAKE) target "MYTARGET= su3_4d" \ + "DEFINES= -DDIMENSION=4" \ + "EXTRA_OBJECTS= setcouplings_gauge.o ploop.o" + +clean: + $(RM) -f *.o + +# Choose one of the lattice layout algorithms: +layout.o: ../generic/layout.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/layout.c +reunitarize.o: ../generic/reunitarize.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/reunitarize.c +timers.o: ../generic/timers.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/timers.c +com_mpi.o: ../generic/com_mpi.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/com_mpi.c +mersenne_inline.o: ../generic/mersenne_inline.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/mersenne_inline.c +setup_basic.o: ../generic/setup_basic.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/setup_basic.c +staples_su3.o: ../generic/staples_su3.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/staples_su3.c +random_su3P.o: ../generic/random_su3P.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/random_su3P.c +parameter_io.o: ../generic/parameter_io.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/parameter_io.c +timecheck.o: ../generic/timecheck.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/timecheck.c + +gaussian_ran.o: ../generic/gaussian_ran.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/gaussian_ran.c + +block_lattice.o: ../generic/block_lattice.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/block_lattice.c +smooth_field_su3adjoint.o: ../generic/smooth_field_su3adjoint.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/smooth_field_su3adjoint.c +smooth_link_su3.o: ../generic/smooth_link_su3.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/smooth_link_su3.c +block_field_su3adjoint.o: ../generic/block_field_su3adjoint.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/block_field_su3adjoint.c +block_link_su3.o: ../generic/block_link_su3.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/block_link_su3.c + +target: $(OBJECTS) $(MACHINE_DEP) $(EXTRA_OBJECTS) $(QCDLIB) + $(COMPILER) $(CFLAGS) -o $(MYTARGET) $(DEFINES) \ + $(OBJECTS) $(MACHINE_DEP) $(EXTRA_OBJECTS) $(QCDLIB) $(LDFLAGS) + +kernel-objects: $(OBJECTS) $(MACHINE_DEP) $(HOBJECTS) $(QCDLIB) + echo "mache kernel_B" \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Xoverrelax.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Xoverrelax.c new file mode 100644 index 0000000000000000000000000000000000000000..f78cfad596d113db447e5ffd3990f2ebc8482686 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Xoverrelax.c @@ -0,0 +1,130 @@ +/************************************************************** + * * + * adjoint overrelaxation routine * + * * + *************************************************************/ + +#include "lattice.h" + + +double Xoverrelax(int parity, adjoint_matrix *ahiggs, adjoint_matrix *adjstaple) +{ + double rp,x,x2,y2,xn,a,b,c,vi,b0,b1,b2,f1,f1p3,f2,f3,f4,f5,f6; + double xa,xb,pr,r2,a1; + int i,j,nhits; + double rsum; + + nhits = rsum = 0; + + /* the coeffs of the polynomial + * s = -betaA v p + beta2 v^2 + beta4 v^4 + */ + + vi = 1.0/beta4; + + forparity(i,parity) { + + prefetch_adjoint(&ahiggs[i+1]); + prefetch_adjoint(&adjstaple[i+1]); + + rp = a1 = r2 = 0.0; + for (j=0; j<8; j++) { + a1 += ahiggs[i].l[j] * adjstaple[i].l[j]; + r2 += sqr(ahiggs[i].l[j]); + rp += sqr(adjstaple[i].l[j]); + } + rp = sqrt(rp); + + /* vector angle is given by V*P = cos theta; x = V*\hatP */ + + x = a1/rp; + x2 = sqr(x); + y2 = r2 - x2; + + /* NOW rr == -betaA * rp; + * vx2 == beta2 + * vx4 == beta4 + * act = rr*x + vx2*(x2+y2) + vx4*(x2+y2)^2 + * calculate the coeffs. of the 4-th order action polynomial + * act = rr*x + vx2*(x2+y2) + vx4*(x2+y2)^2 + * => vx4 x^4 + (vx2 + 2*vx4*y2) x^2 + rr x == v0 + * => x^4 + [(vx2 + 2*vx4*y2)/vx4] x^2 + rr/vx4 x + * + [-v0/vx4] == 0 + */ + + b2 = beta2*vi + 2.0*y2; + b1 = -betaA*rp*vi; + b0 = -(b1*x + x2*(b2 + x2)); + + /* (x-x0)(x^3 + ax^2 + bx + c) = x^4 + b2 x^2 + b1 x + b0 */ + + a = x; + b = b2 + x2; + c = b1 + x*b; + + /* if (abs(1.0 + c/(b0/a)) .gt. 1e-10) write(*,*)'c-errror',c,b0/a */ + + /* Now find the zeros of the 3-deg polynomial + * (x^3 + ax^2 + bx + c) + */ + + f1 = -sqr(a) + 3.0*b; + f1p3 = f1*f1*f1; + f2 = -2.0*a*a*a + 9.0*a*b; + f6 = f2 - 27.0*c; + f4 = 4.0*f1p3 + sqr(f6); + + if (f4 >= 0.0) { + + /* only one real solution exists now, this is all what is accepted */ + + f5 = sqrt(f4) + f6; + if (f5 > 0.0) { + f3 = pow(0.5*f5,((double)1.0)/((double)3.0)); + xn = (-a - f1/f3 + f3)*(1.0/3.0); + + /* Now accept/reject the update with the derivatives + * d[x^4 + b2 x^2 + b1 x + b0] = 4 x^3 + 2 b2 x + b1 + */ + + xa = x*(4.0*x2 + 2.0*b2) + b1; + xb = xn*(4.0*sqr(xn) + 2.0*b2) + b1; + pr = fabs(xa/xb); + + if (pr >= dran()) { + nhits++; + + /* generate new adj. -- now we have x and xn wrt. p => + * v <- v + (xn-x) \hat p + */ + + for (j=0; j<8; j++) { + ahiggs[i].l[j] += adjstaple[i].l[j] * ((xn-x)/rp); + } + + r2 = y2 + xn*xn; + } + + rsum += r2; /* cumulate r2-value */ + + } else { + printf(" *** OR branch 1, value of f4: %g f5: %g\n",f4,f5); + halt("****** OR stop"); + } + } else { + printf(" OR branch 2, value of f4: %g f5: %g\n",f4,f5); + halt("****** OR stop"); + } + } /* FORSOMEPARITY */ + + if (parity == EVEN) ahitax += 1.0*nhits/(node.evensites); + else ahitax += 1.0*nhits/(node.oddsites); + nhitax++; + + return(rsum); + +} + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/adjmat_operations.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/adjmat_operations.c new file mode 100644 index 0000000000000000000000000000000000000000..75620420cb610cf7e7c397810e3eb3bd3b9ed995 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/adjmat_operations.c @@ -0,0 +1,163 @@ + +/* + * void make_adjointmat( su3_matrix *m3, adjointmat *ah3) + * takes the hermitian and traceless part of su3_matrix + * in terms of generators + */ +#include "complex.h" +#include "su3.h" + +#define sqrt3 1.7320508075688772 + +void +compress_adjmat(m3,a3) + su3_matrix *m3; + adjoint_matrix *a3; +{ + a3->l[0] = m3->e[0][0].real - m3->e[1][1].real; + a3->l[1] = (1.0/sqrt3)*(m3->e[0][0].real + + m3->e[1][1].real - 2.0*m3->e[2][2].real); + + a3->l[2] = m3->e[0][1].real + m3->e[1][0].real; + a3->l[3] = m3->e[0][1].imag - m3->e[1][0].imag; + a3->l[4] = m3->e[0][2].real + m3->e[2][0].real; + a3->l[5] = m3->e[0][2].imag - m3->e[2][0].imag; + a3->l[6] = m3->e[1][2].real + m3->e[2][1].real; + a3->l[7] = m3->e[1][2].imag - m3->e[2][1].imag; +}/* make_adjmat */ + + +/* + * void uncompress_adjointmat( su3_matrix *m3, adjointmat *ah3) + * takes the adjoint matrix and throws it in SU(3)-matrix + */ + +void +uncompress_adjmat(a3,m3) + su3_matrix *m3; + adjoint_matrix *a3; +{ + radix t; + + t = a3->l[1]*(1.0/sqrt3); + + m3->e[0][0].real = 0.5*(a3->l[0] + t); + m3->e[0][0].imag = 0.0; + m3->e[1][1].real = 0.5*(-a3->l[0] + t); + m3->e[1][1].imag = 0.0; + m3->e[2][2].real = -t; + m3->e[2][2].imag = 0.0; + + m3->e[0][1].real = m3->e[1][0].real = 0.5*a3->l[2]; + m3->e[0][1].imag = 0.5*a3->l[3]; + m3->e[1][0].imag = -0.5*a3->l[3]; + + m3->e[0][2].real = m3->e[2][0].real = 0.5*a3->l[4]; + m3->e[0][2].imag = 0.5*a3->l[5]; + m3->e[2][0].imag = -0.5*a3->l[5]; + + m3->e[1][2].real = m3->e[2][1].real = 0.5*a3->l[6]; + m3->e[1][2].imag = 0.5*a3->l[7]; + m3->e[2][1].imag = -0.5*a3->l[7]; +}/* uncmp_adjointmat */ + + +/* void make_adjointmat( su3_matrix *m3, adjointmat *ah3) + * takes the hermitian and traceless part of su3_matrix + * in terms of generators + */ + +void +make_adjointmat(m3,a3) + su3_matrix *m3; + adjoint_matrix *a3; +{ + compress_adjmat(m3,a3); +} + + +/****************************************************** + * + * adjoint arithmetics + * + *****************************************************/ + +void +add_adjmat(a,b,t) + adjoint_matrix *a,*b,*t; +{ + int i; + for (i=0; i<8; i++) t->l[i] = a->l[i] + b->l[i]; +} + + +void +adj_scalar_mul(a,s,t) + adjoint_matrix *a,*t; + double s; +{ + int i; + for (i=0; i<8; i++) t->l[i] = (s) * a->l[i]; +} + + +void +adj_scalar_mul_add(a,s,t) + adjoint_matrix *a,*t; + double s; +{ + int i; + for (i=0; i<8; i++) t->l[i] += (s) * a->l[i]; +} + +radix +adj_sqr(adjoint_matrix *a) +{ + int i; + radix f; + for (f=i=0; i<8; i++) f += a->l[i] * a->l[i]; + return(f); +} + +radix +adj_dot(adjoint_matrix *a,adjoint_matrix *b) +{ + int i; + radix f; + for (f=i=0; i<8; i++) f += a->l[i] * b->l[i]; + return(f); +} + + +void +mult_su3_ahiggs( su3_matrix *m, adjoint_matrix *a, adjoint_matrix *r ) +{ + su3_matrix tmat1,tmat2; + + uncompress_adjmat( a, &tmat1 ); + mult_su3_nn( m, &tmat1, &tmat2 ); + mult_su3_na( &tmat2, m, &tmat1 ); + compress_adjmat( &tmat1, r ); +} + +void +mult_adj_su3_ahiggs( su3_matrix *m, adjoint_matrix *a, adjoint_matrix *r ) +{ + su3_matrix tmat1,tmat2; + + uncompress_adjmat( a, &tmat1 ); + mult_su3_na( m, &tmat1, &tmat2 ); + mult_su3_nn( &tmat2, m, &tmat1 ); + compress_adjmat( &tmat1, r ); +} + + + + +void +mult_su3_by_I(su3_matrix *a, su3_matrix *b) +{ + int i,j; + + for(i=0;i<3;i++)for(j=0;j<3;j++) CMUL_I(a->e[i][j],b->e[i][j]); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/control.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/control.c new file mode 100644 index 0000000000000000000000000000000000000000..91c9261b512cc8bace7e2b8e46fdeb04828069be --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/control.c @@ -0,0 +1,230 @@ +/******************************************************************** + * * + * SU(3) adjoint Higgs field in 3d * + * * + * coupling constants: betag, x, y * + * action is always exp[-S] * + * * + * Kari Rummukainen, May 97 (MIMD) * + *******************************************************************/ + +#define CONTROL +#include "lattice.h" /* global variables for lattice fields */ + + +void runthis(int maxiters,int status); + +static int istimelimit = 0; + +/* PABS main -> kernel_b */ + +int +kernel_b() +{ + int kernel_number = 1; + int s_iteration; + int i,status; + time_t t; + double at; + + /* JuBE */ + /* call jube initial function */ + jube_kernel_init(&kernel_number); + + + t = time(NULL); + + /* Machine initialization first */ + initial_setup(); + + /* set up */ + status = setup(); + + /* allocate the variable fields */ + foralldir(i) U[i] = new_latfield( su3_matrix ); +#ifdef HIGGS + ahiggs = new_latfield( adjoint_matrix ); +#endif + + /* load in config, if it exists */ + load_config(status); + + /* Setup measurement etc files */ + setfiles((status == 0) && (iteration != 0)); + +#ifdef HIGGS + /* check if we want to do this multicanonically */ + setmulti(); +#endif + + /* timelimit, if it is used */ +/* PABS, no time limit */ +/* istimelimit = setup_timelimit(t,argc-1,argv[1]); */ + istimelimit = 0; + + fflush(stdout); + + /* first, set the Metropolis scales and thermalise */ + if (status >= 1) { + runthis(n_thermal,1); + } + + if (this_node == 0) { + printf(" - Time spent thermalising: %lg seconds\n",cputime()); + fflush(stdout); + } + + resettime(); /* reset the clock */ + + if (status > 0) timeu = timea = timerest = 0; + ahithb = nhithb = ahitu = nhitu = 0; +#ifdef HIGGS + ahitua = ahitax = ahitmc = ahitog = 0.0; + nhitua = nhitax = nhitmc = nhitog = 0; +#endif + s_iteration = iteration; + + /* JuBE */ + /* call jube run function */ + jube_kernel_run(); + + runthis(n_iteration,0); + + /* JuBE */ + /* call jube finalize function */ + jube_kernel_finalize(); + + + /* print the tail */ + + if (this_node == 0) { + printf("---------\n"); + + printf("Acceptances (after last start: %d iterations:\n", + n_iteration-s_iteration); + if (nhitu) + printf(" Kennedy-Pendleton for gauge: %g (%d sweeps)\n", + ahitu/nhitu,nhitu); +#ifdef HIGGS + if (nhitua) + printf(" Adjoint acceptance for gauge: %g (%d sweeps)\n", + ahitua/nhitua,nhitua); + if (nhitax) + printf(" X-overrelaxation for Higgs: %g (%d sweeps)\n", + ahitax/nhitax,nhitax); + if (nhitog) + printf(" Gaussian overrelaxation for Higgs: %g (%d sweeps)\n", + ahitog/nhitog,nhitog); + if (nhithb) + printf(" Heat bath for Higgs: %g (%d sweeps)\n", + ahithb/nhithb,nhithb); + if (nhitmc) + printf(" Multicanonical acceptance: %g (%d sweeps)\n", + ahitmc/nhitmc,nhitmc); +#endif + + at = timeu+timea+timerest; + printf("\nCpu times:\n"); + printf(" %9.1lf total time in seconds\n",at); + printf(" %9.3lf seconds for one cycle\n",at/n_iteration); + at = 1.0/((mc_steps+1) * n_iteration * lattice.volume); + printf(" %9.1lf seconds for su3 gauge field update\n",timeu); + printf(" %9.3lf microseconds/U/update\n",1e6*timeu*at/3); +#ifdef HIGGS + printf(" %9.1lf seconds for Higgs update\n",timea); + printf(" %9.3lf microseconds/Higgs/update\n",1e6*timea*at); + printf(" %9.1lf seconds for the rest\n",timerest); +#endif + +#ifdef check + print_check(); +#endif + at = cputime(); + printf("Resources:"); + printf(" Cpu: %lg\n",at); + t = time(NULL) - t; + printf(" Wallclock time %ld seconds, Cpu/Wall %lg\n",t,at/t); + + printf("#########\n"); + } +#ifdef MPI + report_comm_timers(); +/* MPI_Finalize(); */ +#endif + + /* JuBE */ + /* call jube finalize function */ + jube_kernel_end(); + + return 0; +} + + +/************************************************* + * do the whole run + */ + +void +runthis(int maxiters,int status) +{ + int meas,i; + + if (istimelimit) inittimecheck(); + + meas = (status == 0); + iteration ++; + for (; iteration <= maxiters; iteration++) { + for (i=0; i F + * + * 1/2 Tr(X UU F') + */ + + static int not_alloc = 1; + int z,zd,i,d,blev,nz; + double w,th,tk,tH,tK,tr2,tr3,tp,tap; + static double_complex *hz1[2][MAX_BOP]; + static double *hz0[2][MAX_BOP],*rz[2][MAX_BOP], *pz[2][MAX_BOP], *fz_array; + + nz = lattice.size[ZUP]; + + if (not_alloc) { + /* first, allocate the needed arrays and set pointers */ + + not_alloc = 0; + fz_array = (double *)calloc(nz*n_bop*(1*2*3 + 2*2*1),sizeof(double)); + for (i=0; i 1) { + free_latfield( b_higgs ); + foralldir(i) free_latfield( b_link[i] ); + } + + reset_blocking_level(); +} + +/**************************************************************** + * * + * Get the plaquette to ->staple * + * Symmetrize it to a clover form * + * * + ***************************************************************/ + +void +getclover(su3_matrix *b_link[NDIM], su3_matrix *clover) +{ + int i; + msg_tag *tag0,*tag1; + su3_matrix ta,tb,*tmpmat; + + /* gather up-links */ + + tmpmat = tmp_latfield( su3_matrix ); + + tag0 = start_get( b_link[YUP], XUP, EVENODD ); + tag1 = start_get( b_link[XUP], YUP, EVENODD ); + + /* multiply up-up -plaq */ + forallsites_wait2(i,tag0,tag1) { + mult_su3_nn( &b_link[XUP][i],&b_link[YUP][nb(XUP,i)], &ta); + mult_su3_na( &ta, &b_link[XUP][nb(YUP,i)], &tb); + mult_su3_na( &tb, &b_link[YUP][i], &clover[i] ); + /* shift it YUP too */ + mult_su3_an( &b_link[YUP][i], &tb, &tmpmat[i] ); + } + + /* move plaq YUP */ + tag0 = start_get( tmpmat, YDOWN, EVENODD ); + forallsites_wait(i,tag0) { + add_su3_matrix( &clover[i], &tmpmat[nb(YDOWN,i)], &clover[i] ); + } + + /* this can not be merged with the one above! */ + /* prepare for XUP */ + forallsites(i) { + mult_su3_an( &b_link[XUP][i], &clover[i], &ta ); + mult_su3_nn( &ta, &b_link[XUP][i], &tmpmat[i] ); + } + + /* move XUP */ + tag0 = start_get( tmpmat, XDOWN, EVENODD ); + forallsites_wait(i,tag0) { + add_su3_matrix( &clover[i], &tmpmat[nb(XDOWN,i)], &clover[i] ); + } + + free_tmp( tmpmat ); +} + +/**************************************************************** + * * + * Calculate the blocked correlations * + * * + ***************************************************************/ + +void +Hvalues(int i, su3_matrix *clover, adjoint_matrix *b_higgs, + double *h10, double_complex *h11, double *h20, + double_complex *h21) +{ + /* this now calculates H_i = i eps_ijk Tr A0 U_jk + * assumes that clover is in ->staple, and + * adjoint higgs in b_higgs + */ + + su3_matrix u,a,a2; + double td,tr; + + td = pi2*((double)xcoord(i))/lattice.size[XUP]; + + /* calculate I (U - U') */ + su3_adjoint( &clover[i], &u); + sub_su3_matrix( &clover[i], &u, &a); + mult_su3_by_I( &a, &u ); + + uncompress_adjmat( &b_higgs[i], &a); + + /* get A0 U_12, this is real */ + + *h10 = tr = realtrace_su3( &a, &u); /* a' *u */ + h11->real = cos(td) * tr; + h11->imag = sin(td) * tr; + + /* and also A0^2 U_12, real */ + + mult_su3_nn( &a, &a, &a2); + *h20 = tr = realtrace_su3( &a2, &u); /* also a2' * u */ + h21->real = cos(td) * tr; + h21->imag = sin(td) * tr; +} + + +/**************************************************************** + * * + * A0^3 from ->b_higgs * + * * + ***************************************************************/ + +double +R3value(int i, adjoint_matrix *b_higgs) +{ + su3_matrix a,a2; + + uncompress_adjmat(&b_higgs[i], &a); + mult_su3_nn(&a, &a, &a2); + return((double)realtrace_su3( &a2, &a)); /* a2' a */ +} + + +/**************************************************************** + * * + * calculate plaq. correlations * + * uses ->staple ! * + * and A0 in -> b_higgs * + * * + ***************************************************************/ + +double +Pvalue(int i, double *ap, su3_matrix *clover, adjoint_matrix *b_higgs) +{ + int k; + double p; + su3_matrix a,p2; + + for (p=k=0; k<3; k++) p += clover[i].e[k][k].real; + + su3_adjoint(&clover[i], &a); + add_su3_matrix(&clover[i], &a, &p2); + uncompress_adjmat(&b_higgs[i], &a); + + *ap = (double)realtrace_su3( &p2, &a); /* p2' * a */ + return(p); +} + +/*************************************************************/ + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/gaugefix.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/gaugefix.c new file mode 100644 index 0000000000000000000000000000000000000000..3beb4feeff7185bec825dae13b0bee91a64c8279 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/gaugefix.c @@ -0,0 +1,356 @@ +/************************** gaugefix.c *******************************/ +/* Fix Coulomb or Lorentz gauge by doing successive SU(2) gauge hits */ +/* Uses double precision global sums */ +/* This version does automatic reunitarization at preset intervals */ +/* MIMD version 6 */ +/* C. DeTar 10-22-90 */ +/* T. DeGrand 1993 */ +/* U.M. Heller 8-31-95 */ +/* C. DeTar 10-11-97 converted to generic */ +/* C. DeTar 12-26-97 added automatic reunitarization */ +/* C. DeTar 11-24-98 remove superfluous references to p2 (was for ks phases) */ + +/* Heavily modified by Kari Rummukainen 2005-6 */ + +/* Prototype... + +void gaugefix(int gauge_dir,double relax_boost,int max_gauge_iter, + double gauge_fix_tol, suN_matrix gauge ); + + if gauge == NULL do not return the gauge + + ------------------------------------------------------------------- + + NOTE: For staggered fermion applications, it is necessary to remove + the KS phases from the gauge links before calling this procedure. + See "rephase" in setup.c. + + ------------------------------------------------------------------- + EXAMPLE: Fixing only the link matrices to Coulomb gauge with scratch + space in mp (suN_matrix) and chi (suN_vector): + + gaugefix(TUP,1.5,500,1.0e-7,NULL); + + ------------------------------------------------------------------- + EXAMPLE: Fixing Coulomb gauge with respect to the y direction + in the staggered fermion scheme and simultaneously transforming + the pseudofermion fields and gauge-momenta involved in updating: + + rephase( OFF ); + gauge = new_latfield(suN_matrix); + gaugefix( YUP, 1.8, 500, 2.0e-6, gauge ); + vec_fix_gauge( gauge, g_rand, EVENODD ); + vec_fix_gauge( gauge, phi, EVEN ); + vec_fix_gauge( gauge, xxx, EVEN ); + free_latfield( gauge ); + rephase( ON ); + + ------------------------------------------------------------------- + + gauge_dir specifies the direction of the "time"-like hyperplane + for the purposes of defining Coulomb or Lorentz gauge + TUP for evaluating propagators in the time-like direction + ZUP for screening lengths. + -1 for Lorentz gauge + relax_boost Overrelaxation parameter + max_gauge_iter Maximum number of iterations + gauge_fix_tol Stop if change is less than this +*/ + +#include "lattice.h" +#define REUNIT_INTERVAL 50 + +#ifdef SU2 +DOES NOT WORK YET FOR SU2 +#endif + +typedef struct { complex e[2][2]; } su2_matrix; +#define Ncol 3 + +/* CDIF(a,b) a -= b */ + /* a -= b */ +#define CDIF(a,b) { (a).real -= (b).real; (a).imag -= (b).imag; } + +/* Useful routines, will be relocated */ + +void mult_su2_mat_vec_elem_a(su2_matrix *u,complex *x0,complex *x1) +{ + /* Multiplies the complex row spinor (x0, x1) by the adjoint of the */ + /* SU(2) matrix u and puts the result in (x0,x1). */ + /* Thus x <- x * u-adj */ + /* C. DeTar 3 Oct 1990 */ + + complex z0, z1, t0, t1; + + t0 = *x0; t1 = *x1; + + CMUL_J(t0, u->e[0][0], z0); + CMUL_J(t1, u->e[0][1], z1); + CADD(z0, z1, *x0); + CMUL_J(t0, u->e[1][0], z0); + CMUL_J(t1, u->e[1][1], z1); + CADD(z0, z1, *x1); + +} /* m_su2_mat_vec_a.c */ + + +void right_su2_hit_a(su2_matrix *u,int p,int q,su3_matrix *link) +{ + /* link <- link * u adj */ + /* The 0 column of u-adjoint matches column p of the SU(3) matrix */ + /* The 1 column of u-adjoint matches column q of the SU(3) matrix */ + /* C. DeTar 18 Oct 1990 */ + + register int m; + + for (m = 0; m < Ncol; m++) + mult_su2_mat_vec_elem_a(u, &(link->e[m][p]), &(link->e[m][q])); + +} /* r_su2_hit_a.c */ + + +/* Scratch space */ + +void accum_gauge_hit(int i, int gauge_dir, + su3_matrix *diffmat, su3_vector *sumvec ) +{ + + /* Accumulates sums and differences of link matrices for determining optimum */ + /* hit for gauge fixing */ + /* Differences are kept in diffmat and the diagonal elements of the sums */ + /* in sumvec */ + + register int j; + register su3_matrix *m1; + register int dir; + + /* Clear sumvec and diffmat */ + + clear_su3mat( diffmat ); + clearvec( sumvec ); + + /* Subtract upward link contributions */ + + foralldir(dir) if (dir != gauge_dir) { + int odir = opp_dir(dir); + + m1 = &(U[dir][i]); + sub_su3_matrix( diffmat, m1, diffmat); + /* Sum diagonal part */ + for(j=0; jc[j], m1->e[j][j] ); + + + /* Add downward link contributions */ + + m1 = &U[dir][nb(odir,i)]; + add_su3_matrix( diffmat, m1, diffmat ); + for(j=0; jc[j], m1->e[j][j] ); + } +} /* accum_gauge_hit */ + + +void do_hit(int gauge_dir, int parity, double relax_boost, su3_matrix *gauge ) +{ + /* Do optimum SU(2) gauge hit for p, q subspace */ + + double a0,a1,a2,a3,asq,a0sq,x,r,xdr; + int dir,i,p,q; + su2_matrix u; + su3_matrix diffmat; + su3_vector sumvec; + + /* Accumulate sums for determining optimum gauge hit - + * U's must have been fetched from down! */ + + /* accum_gauge_hit( gauge_dir, parity, diffmat, sumvec); */ + + forparity(i,parity) for (p=0; p X |-- uf --> af + * + * 2 beta tr (a [u an u']) + * + * Note that [ ] is also 'adjoint!' + */ + + register int i,dir, odir, otherparity; + msg_tag *tag0,*tag1; + su3_matrix tmat1,tmat2; + adjoint_matrix tadj; + + /* Loop over directions, computing force from links */ + + otherparity = opp_parity( parity ); + foralldir(dir) { + odir = opp_dir(dir); + + /* start gather of up-adjoint link */ + + tag0 = start_get( ahiggs, dir, parity ); + + /* multiply adjoint here with up-link, for opp-parity */ + forparity(i,otherparity) { + prefetch_matrix( &U[dir][i+1] ); + prefetch_adjoint( &ahiggs[i+1] ); + uncompress_adjmat(&ahiggs[i],&tmat1); + mult_su3_an( &U[dir][i], &tmat1, &tmat2 ); + mult_su3_nn( &tmat2, &U[dir][i], &tmat1 ); + compress_adjmat( &tmat1, &adjstaple[i] ); + } + + tag1 = start_get( adjstaple, odir, parity ); + + wait_get(tag0); + + /* multiply link with up-adjoint */ + if (dir == XUP) forparity(i,parity) { + prefetch_matrix( &U[dir][i+1] ); + prefetch_adjoint( &ahiggs[nb(dir,i+1)] ); + uncompress_adjmat( &ahiggs[nb(dir,i)], &tmat1); + mult_su3_nn( &U[dir][i], &tmat1, &tmat2 ); + mult_su3_na( &tmat2, &U[dir][i], &tmat1 ); + compress_adjmat( &tmat1, &adjstaple[i] ); + } else forparity(i,parity) { + prefetch_matrix( &U[dir][i+1] ); + prefetch_adjoint( &ahiggs[nb(dir,i+1)] ); + prefetch_adjoint( &adjstaple[i+1] ); + uncompress_adjmat( &ahiggs[nb(dir,i)], &tmat1); + mult_su3_nn( &U[dir][i], &tmat1, &tmat2 ); + mult_su3_na( &tmat2, &U[dir][i], &tmat1 ); + compress_adjmat( &tmat1, &tadj ); + add_adjmat( &adjstaple[i], &tadj, &adjstaple[i] ); + } + + wait_get(tag1); + forparity(i,parity) { + prefetch_adjoint( &adjstaple[i+1] ); + prefetch_adjoint( &adjstaple[nb(odir,i+1)] ); + add_adjmat( &adjstaple[i], &adjstaple[nb(odir,i)] , &adjstaple[i] ); + } + } +} + + +/****************************************************** + * needed for adjoint acceptance + * note-first su3_matrix contains the 'local' adjoint + * matrix uncompressed + * + * acc/rej with + * + * a |-- u --> an + * + * -2 beta tr (a u an u') + * + *****************************************************/ + +double act_gauge_adj(su3_matrix *a, su3_matrix *u,adjoint_matrix *b) +{ + su3_matrix am,m1,m2; + + uncompress_adjmat(b,&am); + mult_su3_nn(a,u,&m1); + mult_su3_na(u,&am,&m2); + return((-2.0)*betaA*realtrace_su3(&m1,&m2)); /* m2' * m1 */ +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/io_lattice.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/io_lattice.c new file mode 100644 index 0000000000000000000000000000000000000000..2f4dab17f12f06c182359d71e4ba7dae3d6ac208 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/io_lattice.c @@ -0,0 +1,46 @@ +/*********************** io_lattice.c *************************/ +/* This reads and writes a (binary) lattice + * + * THIS IS AN ENCAPSULATING FILE TO + * ../generic/io_lattice_generic.c + * HERE WE HAVE TO DEFINE + * + * typedef struct { } allfields; + * + * copy_fields(int site, allfields *s) copy from all latfields to s.(whatever) + * set_fields(allfields *s, int site) copy s.(stuff) to lattice fields + * + * #include "../generic/io_lattice_generic.c" + */ + +#include "lattice.h" + +typedef struct { + su3_matrix link[NDIM]; +#ifdef HIGGS + adjoint_matrix ahiggs; +#endif +} allfields; + +void set_fields( allfields *s, int i ) +{ + int dir; + + foralldir(dir) U[dir][i] = s->link[dir]; +#ifdef HIGGS + ahiggs[i] = s->ahiggs; +#endif +} + +void copy_fields( int i, allfields *s ) +{ + int dir; + + foralldir(dir) s->link[dir] = U[dir][i]; +#ifdef HIGGS + s->ahiggs = ahiggs[i]; +#endif +} + +#include "../generic/io_lattice_generic.c" + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/lattice.h b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/lattice.h new file mode 100644 index 0000000000000000000000000000000000000000..62708d36fcb7100d5b1e029cb2caf2b5b71c17f0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/lattice.h @@ -0,0 +1,128 @@ +/****************************** lattice.h ********************************/ + +/* include file for SU3-adjoint Higgs program, version 2 + This file defines global scalars and the fields in the lattice. */ + +/* #define check */ + +#ifdef CONTROL +#define EXTERN +#else +#define EXTERN extern +#endif + +#define PI 3.14159265358979323846 +#define pi PI +#define pi2 (PI*2.0) + +#include +#include +#include +#include "complex.h" +#include "su3.h" +#include "comdefs.h" +#include "generic.h" +#include "generic_su3.h" + +#ifndef check +#define check_action(a) /* nothing */ +#endif + +#define MAX_BOP 5 /* max number of blockings */ + +/* The following are global scalars */ +EXTERN long seed; /* random number seed */ +EXTERN int mc_steps,n_measurement,n_save; +EXTERN int n_iteration,n_thermal,iteration; +EXTERN double betag; +#ifdef HIGGS +EXTERN double p_x,p_y,betaA,beta4,beta2,betay; +EXTERN int n_correlation,w_correlation; +#endif + +EXTERN double wvalue; /*for multicanonical */ +EXTERN double timeu,timea,timerest; +EXTERN double ahitu,ahitua,ahithb,ahitax,ahitmc,ahitog; /* hit*/ +EXTERN int nhitu,nhitua,nhithb,nhitax,nhitmc,nhitog; +EXTERN int meas_sync,corr_sync; + +#ifdef HIGGS +/* correlation function globals */ +EXTERN int corrlen,n_corr; +EXTERN int n_bop,n_blocking,b_level[MAX_BOP]; + +/* correlation function pointers */ +#define N_CORR 8 +EXTERN float *c_array; +EXTERN float *cr2[MAX_BOP],*cr3[MAX_BOP],*ch0[MAX_BOP],*ch1[MAX_BOP]; +EXTERN float *cH0[MAX_BOP],*cH1[MAX_BOP],*cp0[MAX_BOP],*cp1[MAX_BOP]; + +#define b_const_a1 0.2 +#define b_const_a2 (0.25*(1.0-b_const_a1)) +#define b_const_g1 0.334 +#define b_const_g2 (0.5*(1.0-b_const_g1)) + +#endif + +/***************************************************************** + * Field variables + */ + +EXTERN su3_matrix *U[NDIM]; +#ifdef HIGGS +EXTERN adjoint_matrix *ahiggs; +#endif + +/*****************************************************************/ + +#define confname "config" + +/* PABS replace status by kernel_B.input.status */ +#define statname "kernel_B.input.status" + +#define measurename "measure" +#define corrname "correl" +#define wlname "wloop" + +/* PABS replace beta by kernel_B.input.beta */ +#define betaname "kernel_B.input.beta" + +#define weightname "weight" + +/* PABS replace parameters by kernel_B.input.parameters */ +#define paramname "kernel_B.input.parameters" + +#ifndef T3E +#define prefetch_adjoint(x) /* nothing */ +#define prefetch_matrix(x) /* nothing */ +#endif + +void reunitarize(su3_matrix *link[NDIM]); +int setup(void); +void load_config(int status); +void updatehiggs(int isover); +void measure(); void writemeas(); void hcorr(); void writecorr(); +void setfiles(int restart); +void dumpall(int status,int * maxiters); +void updategauge(int isrelax); +void relax(int dir, int parity, su3_matrix *link[NDIM], su3_matrix *staple +#ifdef HIGGS + , su3_matrix *ac +#endif + ); +void monte(int dir, int parity, su3_matrix *link[NDIM], su3_matrix *staple +#ifdef HIGGS + , su3_matrix *ac +#endif + ); +void staples_su3(su3_matrix *link[NDIM], su3_matrix *staple, int dir1,int parity); +double Xoverrelax(int parity, adjoint_matrix *ahiggs, adjoint_matrix *astaple); +double HBHiggs(int parity, adjoint_matrix *ahiggs, adjoint_matrix *astaple); +double act_gauge_adj(su3_matrix *a, su3_matrix *u,adjoint_matrix *b); + +complex measure_ploop(su3_matrix *link[NDIM], int dir); + +void staple1(int i, int dir1, MATRIX *link[NDIM], MATRIX *staple) ; + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/measure.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/measure.c new file mode 100644 index 0000000000000000000000000000000000000000..074dff478a1785dc9a31bd0810145d9d9cc4de2f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/measure.c @@ -0,0 +1,458 @@ +/**************************************************************** + * MIMD SU3 - adjoint Higgs * + * * + * Measurement routines * + * Kari Rummukainen 1992 - 1996 * + * * + ***************************************************************/ + +#include "lattice.h" /* global variables for lattice fields */ + + +typedef struct { + int headerid,headersize; + int n_double,n_long,n_float,n_char; + int lx,ly,lz,lt; + int d1,d2,d3,d4,d5,d6,d7,d8; +} e_header; +#define E_HEADER_ID 91919191 + +static FILE *measfile, *corrfile; + +#ifdef HIGGS +#define m_plaq 1 +#define m_hopp 2 +#define m_a 3 +#define m_a2 4 +#define m_a3 5 +#define m_a4 6 +#define m_acty 7 +#define N_MEAS 8 +#elif NDIM == 3 +#define m_plaq 0 +#define N_MEAS 1 +#elif NDIM == 4 +#define m_plaq_s 0 +#define m_plaq_t 1 +#define ploop_r 2 +#define ploop_i 3 +#define N_MEAS 4 +#endif +double ma[N_MEAS]; + +void measure_plaq(); +void measure_higgs(); +int reposition(FILE *f,int nmeas); + +void +measure() +{ + int i; + complex ct; + + for (i=0; i F + * + * action is -1/2 beta Tr(X' UF) = -1/2 beta Tr(X F'U') + * + * Multiply and add RF*VF'*UF' + * + * GF: Now B,X,F = 1; + */ + + + msg_tag *tag[NDIM]; + int i,j,dir; + double r2; + su3_matrix a1,a2; + + /* start gathers of points up */ + foralldir(dir) tag[dir] = start_get( ahiggs, dir, EVENODD ); + + forallsites_waitA(i,tag,NDIM) { + uncompress_adjmat( &ahiggs[i], &a1); + + foralldir(dir) + ma[m_hopp] += act_gauge_adj(&a1, &U[dir][i], &ahiggs[nb(dir,i)]); + + for (r2=j=0; j<8; j++) r2 += sqr( ahiggs[i].l[j] ); + + ma[m_a2] += r2; + ma[m_a4] += r2*r2; + + ma[m_a] += sqrt(r2); + + /* and then A0^3 -- + * NOTE: now Tr(A^3)_cont = Tr(A^3)_latt * 2^(3/2) + * and below it is also divided by 3! + */ + mult_su3_nn( &a1, &a1, &a2); + ma[m_a3] += realtrace_su3( &a2, &a1); /* this is a2' * a1, but a2 + is hermitean */ + } + + ma[m_acty] = ma[m_a2] * betay; + + ma[m_hopp] /= 3*lattice.volume*(2.0*betaA); + ma[m_a] /= lattice.volume; + ma[m_a2] /= lattice.volume; + ma[m_a3] /= 3*lattice.volume; + ma[m_a4] /= lattice.volume; +} + +#endif + +/************************************************************ + * write measurements + */ + +void +writemeas() +{ + int i; + + /* sum it */ + g_vecdoublesum(ma, N_MEAS, 0); + + if (this_node == 0) { +#ifdef HIGGS + if (is_multicanonical ) ma[0] = multi_weight(); +#endif + + meas_sync++; + i = (fwrite(ma,sizeof(double),N_MEAS,measfile) == N_MEAS); + if (i) i = (fwrite(&meas_sync,sizeof(int),1,measfile) == 1); + + if (!i) halt("Could not write measurement file"); + + if (meas_sync % 100 == 0) fflush(measfile); + } +} + +#ifdef HIGGS + +/************************************************************ + * write correlations + */ + +void +writecorr() +{ + int i,j; + + corr_sync++; + + if (this_node == 0) { + for (j=0; j 0) { + if (this_node == 0) fil = fopen(confname,"w"); + t = cputime(); + save_binary(fil); + } + + if(this_node == 0){ + printf("+"); + pm++; if (pm >= 20) { printf(" iteration %d\n",iteration); pm = 0; } + fflush(stdout); + if (n_save > 0) fclose(fil); + } + + if (this_node == 0) fil = fopen(statname,"r"); + restart = get_i(fil,"restart",-1); + nn_i = get_i(fil,"n_iteration",-1); + nn_t = get_i(fil,"n_thermal",-1); + if ((nn_i != n_iteration || nn_t != n_thermal)) { + printf0(" -> New limits:thermal %d, work %d\n",nn_t,nn_i); + n_iteration = nn_i; + n_thermal = nn_t; + + if (status == 1) *maxiters = n_thermal; + else *maxiters = n_iteration; + } + + if (this_node == 0) fclose(fil); + + if (this_node == 0) { + + /* flush the files .. */ + /* if (fflush(measfile) != 0) halt(" FILE ERROR when flushing measurements"); + */ + /* if (fflush(corrfil) != 0) halt(" FILE ERROR when flushing correlations"); + */ + + fil = fopen(statname,"w"); + iseed = dran()*(1<<30); + + print_i(fil,"restart",1); /* write now restart */ + print_i(fil,"n_iteration",n_iteration); + print_i(fil,"n_thermal",n_thermal); + print_i(fil,"seed",iseed); + + print_i(fil,"run status",status); + print_i(fil,"iteration",iteration); + print_d(fil,"time: gauge",timeu); +#ifdef HIGGS + print_d(fil,"time: higgs",timea); +#endif + print_d(fil,"time: rest",timerest); + + fclose(fil); + +#ifdef HIGGS + if (is_mucacalc) writemuca(); +#endif + + fflush(stdout); + } +} + +#ifdef check + +/**************************************************************** + * * + * check the terms ... diagnostic routine * + * * + ***************************************************************/ + +static double car[N_MEAS],car2[N_MEAS]; +static int ii=0; + +int +check_action(int stat) +{ + static double arr[N_MEAS]; + int i; + + measure(); + if (stat == 0) { + for (i=0; i X |-- uf --> af + * + * 2 beta tr (a [u an u']) + * + * Note that [ ] is also 'adjoint!' + */ + + register int i,dir, odir, otherparity; + msg_tag *tag0,*tag1; + su3_matrix tmat1,tmat2; + adjoint_matrix tadj; + + /* Loop over directions, computing force from links */ + + otherparity = opp_parity( parity ); + foralldir(dir) { + odir = opp_dir(dir); + + /* start gather of up-adjoint link */ + + tag0 = start_get( ahiggs, dir, parity ); + + /* multiply adjoint here with up-link, for opp-parity */ + forparity(i,otherparity) { + prefetch_matrix( &U[dir][i+1] ); + prefetch_adjoint( &ahiggs[i+1] ); + uncompress_adjmat(&ahiggs[i],&tmat1); + mult_su3_an( &U[dir][i], &tmat1, &tmat2 ); + mult_su3_nn( &tmat2, &U[dir][i], &tmat1 ); + compress_adjmat( &tmat1, &adjstaple[i] ); + } + + tag1 = start_get( adjstaple, odir, parity ); + + wait_get(tag0); + + /* multiply link with up-adjoint */ + if (dir == XUP) forparity(i,parity) { + prefetch_matrix( &U[dir][i+1] ); + prefetch_adjoint( &ahiggs[nb(dir,i+1)] ); + uncompress_adjmat( &ahiggs[nb(dir,i)], &tmat1); + mult_su3_nn( &U[dir][i], &tmat1, &tmat2 ); + mult_su3_na( &tmat2, &U[dir][i], &tmat1 ); + compress_adjmat( &tmat1, &adjstaple[i] ); + } else forparity(i,parity) { + prefetch_matrix( &U[dir][i+1] ); + prefetch_adjoint( &ahiggs[nb(dir,i+1)] ); + prefetch_adjoint( &adjstaple[i+1] ); + uncompress_adjmat( &ahiggs[nb(dir,i)], &tmat1); + mult_su3_nn( &U[dir][i], &tmat1, &tmat2 ); + mult_su3_na( &tmat2, &U[dir][i], &tmat1 ); + compress_adjmat( &tmat1, &tadj ); + add_adjmat( &adjstaple[i], &tadj, &adjstaple[i] ); + } + + wait_get(tag1); + forparity(i,parity) { + prefetch_adjoint( &adjstaple[i+1] ); + prefetch_adjoint( &adjstaple[nb(odir,i+1)] ); + add_adjmat( &adjstaple[i], &adjstaple[nb(odir,i)] , &adjstaple[i] ); + } + } +} + + +/****************************************************** + * needed for adjoint acceptance + * note-first su3_matrix contains the 'local' adjoint + * matrix uncompressed + * + * acc/rej with + * + * a |-- u --> an + * + * -2 beta tr (a u an u') + * + *****************************************************/ + +double act_gauge_adj(su3_matrix *a, su3_matrix *u,adjoint_matrix *b) +{ + su3_matrix am,m1,m2; + + uncompress_adjmat(b,&am); + mult_su3_nn(a,u,&m1); + mult_su3_na(u,&am,&m2); + return((-2.0)*betaA*realtrace_su3(&m1,&m2)); /* m2' * m1 */ +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/monte.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/monte.c new file mode 100644 index 0000000000000000000000000000000000000000..d08ab03eeb39d6d21af5f03f2bdc792d4741dd17 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/monte.c @@ -0,0 +1,302 @@ +/************************** monte.c *******************************/ +/* Kennedy-Pendleton quasi heat bath on SU(2) subgroups */ +/* MIMD version 3 */ +/* T. DeGrand March 1991 */ +/* modified by K.R 97 & 2002 */ + +#include "lattice.h" + +#define Nc 3 + +/* Generic definitions - could be useful elsewhere */ + +typedef struct { complex e[2][2]; } su2_matrix; + +/* #pragma inline ( mult_su2_mat_vec_elem_n, left_su2_hit_n ) */ + +INLINE void mult_su2_mat_vec_elem_n(u,x0,x1) + su2_matrix *u; + complex *x0, *x1; +{ + /* Multiplies the complex column spinor (x0, x1) by the SU(2) matrix u */ + /* and puts the result in (x0,x1). */ + /* Thus x <- u * x */ + /* C. DeTar 3 Oct 1990 */ + + complex z0, z1, t0, t1; + + t0 = *x0; t1 = *x1; + + CMUL(u->e[0][0], t0, z0); + CMUL(u->e[0][1], t1, z1); + CADD(z0, z1, *x0); + CMUL(u->e[1][0], t0, z0); + CMUL(u->e[1][1], t1, z1); + CADD(z0, z1, *x1); + +} /* mult_su2_mat_vec_elem_n */ + + + +/* void dumpsu2(u) su2_matrix *u; { + * int i,j; + * for(i=0;i<2;i++){ + * for(j=0;j<2;j++)printf("(%.2e,%.2e)\t", + * (double)u->e[i][j].real,(double)u->e[i][j].imag); + * printf("\n"); + * } + * printf("\n"); + *} + */ + +INLINE void left_su2_hit_n(su2_matrix *u, int p, int q, su3_matrix *link) +{ + /* link <- u * link */ + /* The 0 row of the SU(2) matrix u matches row p of the SU(3) matrix */ + /* The 1 row of the SU(2) matrix u matches row q of the SU(3) matrix */ + /* C. DeTar 18 Oct 1990 */ + + register int m; + + for (m = 0; m < 3; m++) + mult_su2_mat_vec_elem_n(u, &(link->e[p][m]), &(link->e[q][m])); + +} /* left_su2_hit_n */ + + + +void monte(int dir,int parity, + su3_matrix *link[NDIM], su3_matrix *staple +#ifdef HIGGS + , su3_matrix *a_uncmpr +#endif + ) +{ + /* Do K-P quasi-heat bath by SU(2) subgroups */ + int Nhit, index1, ina, inb,ii,cb; + int gahit,gatry,utry,uhit; + double xr1,xr2,xr3,xr4; + double a0,a1,a2,a3; + double v0,v1,v2,v3, vsq; + double h0,h1,h2,h3; + double r,r2,rho,z; + double al,d, xl,xd; + int k, nacd, test; + double b3; + register int i; + su3_matrix action; + su2_matrix h; + + Nhit = 3; + + b3=betag/3.0; + + gahit = gatry = 0; utry = uhit = 1; + + /* now for the qhb updating */ + for(index1=0;index1 inb) { ii=ina; ina=inb; inb=ii;} + + forparity(i,parity){ + mult_su3_na( &link[dir][i], &staple[i], &action ); + + /* decompose the action into SU(2) subgroups using + * Pauli matrix expansion + * The SU(2) hit matrix is represented as + * a0 + i * Sum j (sigma j * aj) + */ + v0 = action.e[ina][ina].real + action.e[inb][inb].real; + v3 = action.e[ina][ina].imag - action.e[inb][inb].imag; + v1 = action.e[ina][inb].imag + action.e[inb][ina].imag; + v2 = action.e[ina][inb].real - action.e[inb][ina].real; + + vsq = v0*v0 + v1*v1 + v2*v2 + v3*v3; + + if (vsq <= 0.0) { + printf("monte: vsq error! node %d, vsq %g\n",this_node,vsq); + fflush(stdout); + terminate(0); + } + + z = sqrt(vsq ); + /* Normalize u */ + v0 = v0/z; v1 = v1/z; v2 = v2/z; v3 = v3/z; + /* end norm check--trial SU(2) matrix is a0 + i a(j)sigma(j)*/ +/* test +if(this_node == 0)printf("v= %e %e %e %e\n",v0,v1,v2,v3); +if(this_node == 0)printf("z= %e\n",z); +*/ + /* now begin qhb */ + /* get four random numbers */ + + xr1 = log(1.0 - dran()); + xr2 = log(1.0 - dran()); + xr3 = dran(); + xr4 = dran(); + + xr3 = cos(pi2*xr3); + +/* + if(this_node == 0)printf("rand= %e %e %e %e\n",xr1,xr2,xr3,xr4); +*/ + + /* + generate a0 component of su3 matrix + + first consider generating an su(2) matrix h + according to exp(bg/3 * re tr(h*s)) + rewrite re tr(h*s) as re tr(h*v)z where v is + an su(2) matrix and z is a real normalization constant + let v = z*v. (z is 2*xi in k-p notation) + v is represented in the form v(0) + i*sig*v (sig are pauli) + v(0) and vector v are real + + let a = h*v and now generate a + rewrite beta/3 * re tr(h*v) * z as al*a0 + a0 has prob(a0) = n0 * sqrt(1 - a0**2) * exp(al * a0) + */ + al = b3*z; +/*if(this_node == 0)printf("al= %e\n",al);*/ + + /* + let a0 = 1 - del**2 + get d = del**2 + such that prob2(del) = n1 * del**2 * exp(-al*del**2) + */ + + d = -(xr2 + xr1*xr3*xr3)/al; + + /* monte carlo prob1(del) = n2 * sqrt(1 - 0.5*del**2) + then prob(a0) = n3 * prob1(a0)*prob2(a0) + */ + + /* now beat each site into submission */ + nacd = 0; + if ((1.00 - 0.5*d) > xr4*xr4) nacd=1; + if(nacd == 0 && al > 2.0) { /* k-p algorithm */ + test=0; + for(k=0; k<20 && !test;k++) { + /* get four random numbers */ + xr1 = log(1.0 - dran()); + xr2 = log(1.0 - dran()); + xr3 = dran(); + xr4 = dran(); + + xr3 = cos(pi2*xr3); + + d = -(xr2 + xr1*xr3*xr3)/al; + if ((1.00 - 0.5*d) > xr4*xr4) test = 1; + } + utry += k; + uhit++; + + if (this_node == 0 && test != 1) + printf("site took 20 kp hits\n"); + } /* endif nacd */ + + if(nacd == 0 && al <= 2.0) { + /* creutz algorithm */ + xl=exp((double)(-2.0*al)); + xd= 1.0 - xl; + test=0; + for(k=0;k<20 && test == 0 ;k++) { + /* get two random numbers */ + xr1=dran(); + xr2=dran(); + + r = xl + xd*xr1; + a0 = 1.00 + log((double)r)/al; + if((1.0 -a0*a0) > xr2*xr2) test = 1; + } + d = 1.0 - a0; + utry += k; + uhit++; + + if(this_node == 0 && test !=1) + printf("site took 20 creutz hits\n"); + } /* endif nacd */ + + /* generate full su(2) matrix and update link matrix*/ + + /* find a0 = 1 - d*/ + a0 = 1.0 - d; + /* compute r */ + r2 = 1.0 - a0*a0; + r2 = fabs(r2); + r = sqrt(r2); + + /* compute a3 */ + a3=(2.0*dran() - 1.0)*r; + + prefetch_matrix(&a_uncmpr[i] ); + prefetch_adjoint(&ahiggs[nb(dir,i)]); + + /* compute a1 and a2 */ + rho = r2 - a3*a3; + rho = fabs(rho); + rho = sqrt(rho); + + /*xr2 is a random number between 0 and 2*pi */ + xr2 = pi2*dran(); + a1 = rho*cos((double)xr2); + a2 = rho*sin((double)xr2); + + /* now do the updating. h = a*v^dagger, new u = h*u */ + h0 = a0*v0 + a1*v1 + a2*v2 + a3*v3; + h1 = a1*v0 - a0*v1 + a2*v3 - a3*v2; + h2 = a2*v0 - a0*v2 + a3*v1 - a1*v3; + h3 = a3*v0 - a0*v3 + a1*v2 - a2*v1; + + /* Elements of SU(2) matrix */ + + h.e[0][0] = cmplx( h0, h3); + h.e[0][1] = cmplx( h2, h1); + h.e[1][0] = cmplx(-h2, h1); + h.e[1][1] = cmplx( h0,-h3); + + /* update the link */ + +#ifdef HIGGS + action = link[dir][i]; + left_su2_hit_n(&h,ina,inb,&action); + + /* remember: tmpmat contains uncompressed local adj. */ + + a1 = act_gauge_adj(&a_uncmpr[i],&link[dir][i],&ahiggs[nb(dir,i)]); + a2 = act_gauge_adj(&a_uncmpr[i],&action, &ahiggs[nb(dir,i)]); + + prefetch_matrix(&link[dir][i+1]); + prefetch_matrix(&staple[i+1]); + + if (exp(a1-a2) >= dran()) { + link[dir][i] = action; + gahit++; + } + gatry++; + +#else + + left_su2_hit_n(&h,ina,inb,&link[dir][i]); + +#endif + + } /* site */ + } /* hits */ + +#ifdef HIGGS + nhitua++; + ahitua += 1.0*gahit/gatry; +#endif + + nhitu++; + ahitu += 1.0*uhit/utry; + +} /* monte */ + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/multican.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/multican.c new file mode 100644 index 0000000000000000000000000000000000000000..fb77cc8d41878557aab8c99bb3b1204bdd52103d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/multican.c @@ -0,0 +1,11 @@ +/******** setup.c *********/ +/* MIMD code version 3 */ + +#include "lattice.h" + +typedef adjoint_matrix multi_type; /* defines type of MC field */ +#define multi_field ahiggs /* defines MC field */ +#define multi_order(i) adj_sqr( &ahiggs[i] ) /* defines MC order param. */ + +#include "../generic/multican_generic.c" + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/ploop.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/ploop.c new file mode 100644 index 0000000000000000000000000000000000000000..d9c1f38a3ae8f0f10e60260303b20c0bd7b4d333 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/ploop.c @@ -0,0 +1,43 @@ +/**************************************************************** + * MIMD SU3 + * * + * Measure the polyakov loop + * * + ***************************************************************/ + +#include "lattice.h" /* global variables for lattice fields */ + +complex measure_ploop(su3_matrix *U[NDIM], int dir) +{ + su3_matrix ploop,tm; + int i,j,loc,nt; + complex sum,ct; + + /** THIS SIMPLE VERSION WORKS ONLY IF THE dir-DIRECTION + * FITS COMPLETELY WITHIN ONE NODE. THUS, + */ + + sum = cmplx(0.0,0.0); + + if (node.nodesize[dir] != lattice.size[dir]) + halt(" PLOOP:: lattice size error!"); + + nt = lattice.size[dir]; + + /* Now, multiply all dir-links */ + forallsites(i) if (coordinate(i,dir) == 0) { + ploop = U[dir][i]; + loc = i; + for (j=1; je[m][p]), &(link->e[m][q])); + +} /* r_su2_hit_a.c */ + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/relax.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/relax.c new file mode 100644 index 0000000000000000000000000000000000000000..d53d34bf788cfebadf7b184821c4216f7d7af2c8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/relax.c @@ -0,0 +1,109 @@ +/************************** relax.c *******************************/ +/* Microcanonical overrelaxation by doing successive SU(2) gauge hits */ +/* MIMD version 3 */ +/* T. DeGrand March 1991 */ +/* Heavily modified K.R. 97 & 2002 */ + +#include "lattice.h" + +#define Nc 3 + +#ifdef T3E +#define prefetch +#endif + +typedef struct { complex e[2][2]; } su2_matrix; + +/* Codes for interpreting selection of gauge fixing options */ + +void left_su2_hit_n(su2_matrix *u, int p, int q, su3_matrix *link); + +void relax(int dir,int parity, + su3_matrix *link[NDIM], su3_matrix *staple +#ifdef HIGGS + , su3_matrix *a_uncmpr +#endif + ) +{ + /* Do overrelaxation by SU(2) subgroups */ + int Nhit,index1, ina, inb,ii; + int gahit,gatry; + double a0,a1,a2,a3,asq,r; + register int i; + su3_matrix action; + su2_matrix u; + + Nhit = 3; + + gahit = gatry = 0; + + /* now for the overrelaxed updating */ + forparity(i,parity) { + prefetch_matrix(&a_uncmpr[i]); + prefetch_matrix(&(link[dir][i+1])); + prefetch_matrix(&(staple[i+1])); + + for(index1=0;index1 inb) { ii=ina; ina=inb; inb=ii;} + + mult_su3_na( &(link[dir][i]), &(staple[i]), &action ); + + /* decompose the action into SU(2) subgroups using Pauli matrix + * expansion + * The SU(2) hit matrix is represented as + * a0 + i * Sum j (sigma j * aj) + */ + a0 = action.e[ina][ina].real + action.e[inb][inb].real; + a3 = action.e[ina][ina].imag - action.e[inb][inb].imag; + a1 = action.e[ina][inb].imag + action.e[inb][ina].imag; + a2 = action.e[ina][inb].real - action.e[inb][ina].real; + + /* Normalize and complex conjugate u */ + asq = a0*a0 + a1*a1 + a2*a2 + a3*a3; + r = sqrt( asq ); + a0 = a0/r; a1 = -a1/r; a2 = -a2/r; a3 = -a3/r; + /* Elements of SU(2) matrix */ + + u.e[0][0] = cmplx( a0, a3); + u.e[0][1] = cmplx( a2, a1); + u.e[1][0] = cmplx(-a2, a1); + u.e[1][1] = cmplx( a0,-a3); + + /* Do SU(2) hit on all links twice (to overrelax) */ + +#ifdef HIGGS + + action = link[dir][i]; + left_su2_hit_n(&u,ina,inb,&action); + left_su2_hit_n(&u,ina,inb,&action); + + /* remember: tmpmat contains uncompressed local adj. */ + a1 = act_gauge_adj(&a_uncmpr[i], &link[dir][i], &ahiggs[nb(dir,i)]); + a2 = act_gauge_adj(&a_uncmpr[i], &action, &ahiggs[nb(dir,i)]); + + if (exp(a1-a2) >= dran()) { + link[dir][i] = action; + gahit++; + } + gatry++; + +#else + + left_su2_hit_n(&u,ina,inb,&link[dir][i]); + left_su2_hit_n(&u,ina,inb,&link[dir][i]); + +#endif + + } /* st */ + } /* hits */ + +#ifdef HIGGS + nhitua++; + ahitua += 1.0*gahit/gatry; +#endif + +} /* relax */ + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/setcouplings_gauge.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/setcouplings_gauge.c new file mode 100644 index 0000000000000000000000000000000000000000..7a88952e184fdd62d222784af489b33dee09d557 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/setcouplings_gauge.c @@ -0,0 +1,15 @@ +/******** setcouplings_gauge.c *********/ +/* MIMD code version 3 */ + +#include "lattice.h" + +/* Each node has a params structure for passing simulation parameters */ + + +void setcouplings() +{ + if (this_node == 0) + printf(" Input couplings: betag %.8g\n",betag); + +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/setcouplings_higgs.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/setcouplings_higgs.c new file mode 100644 index 0000000000000000000000000000000000000000..64e0e0a7afa549b3f7ad398148d831481fbde9cb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/setcouplings_higgs.c @@ -0,0 +1,62 @@ +/******** setcouplings_higgs.c *********/ +/* MIMD code version 3 */ + +#include "lattice.h" + +/* Each node has a params structure for passing simulation parameters */ + +#define Sigma 3.1759115 +/* #define improve_y */ + + + +void setcouplings() +{ + if (this_node == 0) + printf(" Input couplings: betag %.8g x %.8g y %.8g\n",betag,p_x,p_y); + + /* Improved x: from eq. (2.8) in JHEP11 (1998) 011 + */ + +#ifdef improve_y + p_y *= 1 + (2.574608+0.72985*p_x) / betag; +#endif + +#define improve_x +#ifdef improve_x + p_x = p_x + ( 0.328432 - 0.835282 * p_x + 1.167759 * sqr(p_x) ) / betag; + if (this_node == 0) + printf(" USING IMPROVED x\n"); +#else + if (this_node == 0) + printf(" USING NON-IMPROVED x\n"); +#endif + + /* and calculate the lattice couplings: + * NOTE: This normalizes A0 -> sqrt(2) A0, compared to paper + * thus, now h^a h^a = 2 Tr h^2 = Tr A_cont + * AND (h^a h^a)^2 = (2 Tr h^2)^2 = (Tr A_cont)^2 + * + * Now Tr h^3 = 2^(-3/2) Tr A_cont^3 (note also extra 3 in the measurement) + */ + + betaA = 12/betag; + beta4 = p_x * 1.5 * sqr(betaA)/betag; + beta2 = 3*betaA * (1 + 6*p_y/sqr(betag) + - (6 + 10*p_x)*Sigma/(4*pi*betag) + - 6/(16*sqr(pi*betag)) * + ((60*p_x - 20*sqr(p_x))*(log(betag) + 0.08849) + + 34.768*p_x + 36.130)); + + betay = 3*betaA * 6/sqr(betag); + + if (this_node == 0) { + printf(" Non-improved couplings: betag %.8g x %.8g y %.8g\n",betag,p_x,p_y); + printf(" Other lattice couplings: betaA %.8g beta2 %.8g beta4 %.8g\n", + betaA,beta2,beta4); + + printf(" OUTPUT NORMALIZATION: #5 = Tr A0^2, #7 = (Tr A0^2)^2\n"); + + } +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/setup.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/setup.c new file mode 100644 index 0000000000000000000000000000000000000000..271e21a819aa144528ef7b1581b473824848466e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/setup.c @@ -0,0 +1,236 @@ +/******** setup.c *********/ +/* MIMD code version 3 */ + +#include "lattice.h" + +/* Each node has a params structure for passing simulation parameters */ + +void load_config(int status); +void coldlat(); void hotlat(); +void setcouplings(); + +/* SETUP ROUTINES */ + +int setup() +{ + int i,rstatus,restart,j; + int nx,ny,nz,nt,size[NDIM]; + FILE *fil; + + /* On node zero, read lattice size, seed, nflavors and send to others */ + /* print banner */ + if (this_node == 0) { + printf("--------------------------------------\n"); +#ifdef Higgs + printf("SU3 + adjoint Higgs in %d dimensions\n",NDIM); +#else + printf("SU3 gauge in %d dimensions\n",NDIM); +#endif + printf("Based on MILC MIMD version 3\n"); + printf("Machine = %s, with %d nodes\n",machine_type(),numnodes()); + printf("Overrelaxed/quasi-heat bath algorithm\n"); + + if ((fil = fopen(paramname,"r")) == NULL) halt(" ** No parameter file?"); + printf("\n READING LATTICE SIZE FROM PARAMETER FILE:\n"); + } + + nx = size[XUP] = get_i(fil,"nx",1); + ny = size[YUP] = get_i(fil,"ny",1); + nz = size[ZUP] = get_i(fil,"nz",1); +#if NDIM == 4 + nt = size[TUP] = get_i(fil,"nt",1); +#endif + + /************************************************** + * Initialize the layout and gather functions + */ + setup_lattice(size); + + printf0("\n READING REST OF THE PARAMETER FILE:\n"); + + mc_steps = get_i(fil,"micro steps",1); + n_measurement = get_i(fil,"n_measurement",1); +#ifdef HIGGS + n_correlation = get_i(fil,"n_correlation",1); + w_correlation = get_i(fil,"w_correlation",1); +#endif + n_save = get_i(fil,"n_save",1); + +#ifdef HIGGS + n_blocking = get_i(fil,"blocking levels",1); + for (j=n_bop=0; ja_uncmp */ + forparity(i,parity) { + prefetch_adjoint(ahiggs+2+i); + uncompress_adjmat(&(ahiggs[i]),&(a_uncmpr[i])); + } +#endif + + /* compute the gauge force */ + staples_su3(U, staple, dir, parity); /* goest to ->staple */ + +#ifdef HIGGS + wait_get(tag); /* wait for ahiggs from up */ + if (isrelax) relax(dir, parity, U, staple, a_uncmpr); + else monte(dir, parity, U, staple, a_uncmpr); +#else + if (isrelax) relax(dir, parity, U, staple ); + else monte(dir, parity, U, staple ); +#endif + + } + +#ifdef HIGGS + free_tmp( a_uncmpr ); +#endif + free_tmp( staple ); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/updatehiggs.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/updatehiggs.c new file mode 100644 index 0000000000000000000000000000000000000000..3a6648363101090a6a3ca29c67d478ee3ab9d19a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/updatehiggs.c @@ -0,0 +1,129 @@ +/********************* updatehiggs.c ************************/ + +#include "lattice.h" + +void get_adjstaple(int parity, adjoint_matrix *as); + +void updatehiggs(int relax) +{ + int parity; + double rtot; + adjoint_matrix *adjstaple; + + adjstaple = tmp_latfield( adjoint_matrix ); + + forbothparities(parity) { + + if (is_multicanonical) set_mc_update(parity); + + get_adjstaple(parity, adjstaple); + + /* Here Goverrelax or Xoverrelax */ + if (relax) rtot = Goverrelax(parity, ahiggs, adjstaple); + else rtot = HBHiggs(parity, ahiggs, adjstaple); + + if (is_multicanonical) mc_acceptance(parity, rtot); + } + + free_tmp( adjstaple ); +} + + +/************************************************************* + * * + * calculate the gauge + adjoint Higgs link * + * * + ************************************************************/ + +void get_adjstaple(int parity, adjoint_matrix *adjstaple) +{ + /* calculate the adjoint-gauge link action ('staple') + * + * ab |-- ub --> X |-- uf --> af + * + * 2 beta tr (a [u an u']) + * + * Note that [ ] is also 'adjoint!' + */ + + register int i,dir, odir, otherparity; + msg_tag *tag0,*tag1; + su3_matrix tmat1,tmat2; + adjoint_matrix tadj; + + /* Loop over directions, computing force from links */ + + otherparity = opp_parity( parity ); + foralldir(dir) { + odir = opp_dir(dir); + + /* start gather of up-adjoint link */ + + tag0 = start_get( ahiggs, dir, parity ); + + /* multiply adjoint here with up-link, for opp-parity */ + forparity(i,otherparity) { + prefetch_matrix( &U[dir][i+1] ); + prefetch_adjoint( &ahiggs[i+1] ); + uncompress_adjmat(&ahiggs[i],&tmat1); + mult_su3_an( &U[dir][i], &tmat1, &tmat2 ); + mult_su3_nn( &tmat2, &U[dir][i], &tmat1 ); + compress_adjmat( &tmat1, &adjstaple[i] ); + } + + tag1 = start_get( adjstaple, odir, parity ); + + wait_get(tag0); + + /* multiply link with up-adjoint */ + if (dir == XUP) forparity(i,parity) { + prefetch_matrix( &U[dir][i+1] ); + prefetch_adjoint( &ahiggs[nb(dir,i+1)] ); + uncompress_adjmat( &ahiggs[nb(dir,i)], &tmat1); + mult_su3_nn( &U[dir][i], &tmat1, &tmat2 ); + mult_su3_na( &tmat2, &U[dir][i], &tmat1 ); + compress_adjmat( &tmat1, &adjstaple[i] ); + } else forparity(i,parity) { + prefetch_matrix( &U[dir][i+1] ); + prefetch_adjoint( &ahiggs[nb(dir,i+1)] ); + prefetch_adjoint( &adjstaple[i+1] ); + uncompress_adjmat( &ahiggs[nb(dir,i)], &tmat1); + mult_su3_nn( &U[dir][i], &tmat1, &tmat2 ); + mult_su3_na( &tmat2, &U[dir][i], &tmat1 ); + compress_adjmat( &tmat1, &tadj ); + add_adjmat( &adjstaple[i], &tadj, &adjstaple[i] ); + } + + wait_get(tag1); + forparity(i,parity) { + prefetch_adjoint( &adjstaple[i+1] ); + prefetch_adjoint( &adjstaple[nb(odir,i+1)] ); + add_adjmat( &adjstaple[i], &adjstaple[nb(odir,i)] , &adjstaple[i] ); + } + } +} + + +/****************************************************** + * needed for adjoint acceptance + * note-first su3_matrix contains the 'local' adjoint + * matrix uncompressed + * + * acc/rej with + * + * a |-- u --> an + * + * -2 beta tr (a u an u') + * + *****************************************************/ + +double act_gauge_adj(su3_matrix *a, su3_matrix *u,adjoint_matrix *b) +{ + su3_matrix am,m1,m2; + + uncompress_adjmat(b,&am); + mult_su3_nn(a,u,&m1); + mult_su3_na(u,&am,&m2); + return((-2.0)*betaA*realtrace_su3(&m1,&m2)); /* m2' * m1 */ +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/CHANGELOG b/qcd/part_cpu/applications/QCD/src/kernel_C/CHANGELOG new file mode 100644 index 0000000000000000000000000000000000000000..4202f06ea9f1a71632e3c5b9012fcd06f26c618c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/CHANGELOG @@ -0,0 +1,8 @@ + +22. September 2008 + +DD-HMC-BM-1.0: Initial release + +15. April 2016 + +kernel_C : Updated to openQCD-1.4-bgopt \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/COPYING b/qcd/part_cpu/applications/QCD/src/kernel_C/COPYING new file mode 100644 index 0000000000000000000000000000000000000000..7a8e8abfd0057f374fbf59076c263f1f5d685b73 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/COPYING @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..052a9fa61a736a1cc30f4287ff47679e93327308 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/Makefile @@ -0,0 +1,2 @@ +kernel: + cd main && gmake kernel \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/Makefile.defs.in b/qcd/part_cpu/applications/QCD/src/kernel_C/Makefile.defs.in new file mode 100644 index 0000000000000000000000000000000000000000..809981d6ecb398371b9164abf11e469e880de0a5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/Makefile.defs.in @@ -0,0 +1,12 @@ +CC = #MPI_CC# +CFLAGS = #CFLAGS# + +SHELL = #SHELL# + +AR = #AR# +ARFLAGS = #ARFLAGS# + +LD = #LD# +LDFLAGS = #LDFLAGS# + +RM = #RM# \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/README b/qcd/part_cpu/applications/QCD/src/kernel_C/README new file mode 100644 index 0000000000000000000000000000000000000000..c97f5b5e6c58021403d0a9943b8c214baa0a1911 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/README @@ -0,0 +1,209 @@ + +################################################################################ + + QCD SPEED TESTS + +################################################################################ + + +This document is short guide to get started and run the speed tests. For +more detailed information see the README.extended. + + +PROGRAMS + +The benchmark programs are provided in source form and must be +compiled by the user on the machine that is to be tested. + +In addition the openQCD-1.4 package is needed. A tar-file of the +source code can be obtained from + +http://luscher.web.cern.ch/luscher/openQCD/ + +and should be extracted in the same directory level as this package. + +PROGRAM FEATURES + +All programs parallelize in 0,1,2,3 or 4 dimensions, depending on what is +specified at compilation time. They are highly optimized for machines with +current Intel or AMD processors, but will run correctly on any system that +complies with the ISO C89 (formerly ANSI C) and the MPI 1.2 standards. + +For the purpose of testing and code development, the programs can also +be run on a desktop or laptop computer. All what is needed for this is +a compliant C compiler and a local MPI installation such as Open MPI. + + +DOCUMENTATION + +The simulation program has a modular form, with strict prototyping and a +minimal use of external variables. Each program file contains a small number +of externally accessible functions whose functionality is described at the top +of the file. + +The data layout is explained in various README files and detailed instructions +are given on how to run the main programs. A set of further documentation +files are included in the doc directory, where the normalization conventions, +the chosen algorithms and other important program elements are described. + + +COMPILATION + +The compilation of the programs requires an ISO C89 compliant compiler and a +compatible MPI installation that complies with the MPI standard 1.2 (or later). + +In the main and devel directories, a GNU-style Makefile is included which +compiles and links the programs (just type "make" to compile everything; "make +clean" removes the files generated by "make"). The compiler options can be set +by editing the CFLAGS line in the Makefiles. + +The Makefiles assume that the following environment variables are set: + + GCC GNU C compiler command [Example: /usr/bin/gcc]. + + MPI_HOME MPI home directory [Example: /usr/lib64/mpi/gcc/openmpi]. + The mpicc command used is the one in $MPI_HOME/mpicc and + the MPI libraries are expected in $MPI_HOME/lib. + + MPI_INCLUDE Directory where the mpi.h file is to be found. + +All programs are then compiled using the $MPI_HOME/bin/mpicc command. The +compiler options that can be set in the CFLAGS line depend on which C compiler +the mpicc command invokes (the GCC compiler command is only used to resolve +the dependencies on the include files). + + +SSE/AVX ACCELERATION + +Current Intel and AMD processors are able to perform arithmetic operations on +short vectors of floating-point numbers in just one or two machine cycles, +using SSE and/or AVX instructions. The arithmetic performed by these +instructions fully complies with the IEEE 754 standard. + +Many programs in the module directories include SSE and AVX inline-assembly +code. On 64bit systems, and if the GNU or Intel C compiler is used, the code +can be activated by setting the compiler flags -Dx64 and -DAVX, respectively. +In addition, SSE prefetch instructions will be used if one of the following +options is specified: + + -DP4 Assume that prefetch instructions fetch 128 bytes at a time + (Pentium 4 and related Xeons). + + -DPM Assume that prefetch instructions fetch 64 bytes at a time + (Athlon, Opteron, Pentium M, Core, Core 2 and related Xeons). + + -DP3 Assume that prefetch instructions fetch 32 bytes at a time + (Pentium III). + +These options have an effect only if -Dx64 or -DAVX is set. The option +-DAVX implies -Dx64. + +On recent x86-64 machines with AMD Opteron or Intel Xeon processors, for +example, the recommended compiler flags are + + -std=c89 -O -mno-avx -DAVX -DPM + +For older machines that do not support the AVX instruction set, the +recommended flags are + + -std=c89 -O -mno-avx -Dx64 -DPM + +More aggressive optimization levels such as -O2 and -O3 tend to have little +effect on the execution speed of the programs, but the risk of generating +wrong code is higher. + +AVX instructions and the option -mno-avx may not be known to old versions +of the compilers, in which case one is limited to SSE accelerations with +option string -std=c89 -O -Dx64 -DPM. + + +DEBUGGING FLAGS + +For troubleshooting and parameter tuning, it may helpful to switch on some +debugging flags at compilation time. The simulation program then prints a +detailed report to the log file on the progress made in specified subprogram. + +The available flags are: + +-DCGNE_DBG CGNE solver. + +-DFGCR_DBG GCR solver. + +-FGCR4VD_DBG GCR solver for the little Dirac equation. + +-DMSCG_DBG MSCG solver. + +-DDFL_MODES_DBG Deflation subspace generation. + +-DMDINT_DBG Integration of the molecular-dynamics equations. + +-DRWRAT_DBG Computation of the rational function reweighting + factor. + + +RUNNING A SIMULATION + +The simulation programs reside in the directory "main". For each program, +there is a README file in this directory which describes the program +functionality and its parameters. + +Running a simulation for the first time requires its parameters to be chosen, +which tends to be a non-trivial task. The syntax of the input parameter files +and the meaning of the various parameters is described in some detail in +main/README.infiles and doc/parms.pdf. Examples of valid parameter files are +contained in the directory main/examples. + + +EXPORTED FIELD FORMAT + +The field configurations generated in the course of a simulation are written +to disk in a machine-independent format (see modules/misc/archive.c). +Independently of the machine endianness, the fields are written in little +endian format. A byte-reordering is therefore not required when machines with +different endianness are used for the simulation and the physics analysis. + + +AUTHORS + +The initial release of the openQCD package was written by Martin Lüscher and +Stefan Schaefer. Support for Schrödinger functional boundary conditions was +added by John Bulava. Several modules were taken over from the DD-HMC program +tree, which includes contributions from Luigi Del Debbio, Leonardo Giusti, +Björn Leder and Filippo Palombi. + + +ACKNOWLEDGEMENTS + +In the course of the development of the openQCD code, many people suggested +corrections and improvements or tested preliminary versions of the programs. +The authors are particularly grateful to Isabel Campos, Dalibor Djukanovic, +Georg Engel, Leonardo Giusti, Björn Leder, Carlos Pena and Hubert Simma for +their communications and help. + + +LICENSE + +The software may be used under the terms of the GNU General Public Licence +(GPL). + + +BUG REPORTS + +If a bug is discovered, please send a report to . + + +ALTERNATIVE PACKAGES AND COMPLEMENTARY PROGRAMS + +There is a publicly available BG/Q version of openQCD that takes advantage of +the machine-specific features of IBM BlueGene/Q computers. The version is +available at . + +The openQCD programs currently do not support reweighting in the quark +masses, but a module providing this functionality can be downloaded from +. + +Previously generated gauge-field configurations are often used as initial +configuration for a new run. If the old and new lattices or boundary +conditions are not the same, the old configuration may however need to be +adapted, using a field conversion tool such as the one available at +, before the new run is started. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/global.defs.in b/qcd/part_cpu/applications/QCD/src/kernel_C/global.defs.in new file mode 100644 index 0000000000000000000000000000000000000000..628a5315a95eb0494bcc4f4a5208170c68362448 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/global.defs.in @@ -0,0 +1,8 @@ +#define NPROC0 #NPROC0# +#define NPROC1 #NPROC1# +#define NPROC2 #NPROC2# +#define NPROC3 #NPROC3# +#define L0 #L0# +#define L1 #L1# +#define L2 #L2# +#define L3 #L3# diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/include/bm.h b/qcd/part_cpu/applications/QCD/src/kernel_C/include/bm.h new file mode 100644 index 0000000000000000000000000000000000000000..37dd8a8d7bca60a10f6fa9efd32a1e17ceeb6b8f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/include/bm.h @@ -0,0 +1,35 @@ +/******************************************************************************* +* +* File bm.h +* +* Copyright (C) 2008 Bjorn Leder +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef BM_H +#define BM_H + +#define NMR 4 +#define NCY 5 +#define NS 16 + +#ifndef CG_ITER_C +extern void time_cg_iter(FILE *flog, double *wdt); +#endif + +#ifndef CG_ITER_DBLE_C +extern void time_cg_iter_dble(FILE *flog, double *wdt); +#endif + +#ifndef TIME_MSAP_C +extern void time_msap(FILE *flog, double *wdt); +#endif + +#ifndef TIME_AWHAT_C +extern void time_Awhat(FILE *flog, double *wdt, int* nb); +#endif + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/.Makefile.oqcd.kate-swp b/qcd/part_cpu/applications/QCD/src/kernel_C/main/.Makefile.oqcd.kate-swp new file mode 100644 index 0000000000000000000000000000000000000000..2f97f3847e6a1850291c1922b749090459be5a6d Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_C/main/.Makefile.oqcd.kate-swp differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..0a5af41bf67a4d34458572cb63ee7524a4ef04ed --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile @@ -0,0 +1,151 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +include ../Makefile.defs + +all: rmxeq mkdep mkxeq +.PHONY: all + +# main programs and modules to be compiled + +MAIN = time1 time2 time3 time3test + +ARCHIVE = archive sarchive + +BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + +DFL = dfl_geometry dfl_subspace ltl_gcr dfl_sap_gcr dfl_modes + +DIRAC = Dw_dble Dw Dw_bnd + +FLAGS = flags action_parms dfl_parms force_parms hmc_parms lat_parms \ + mdint_parms rat_parms rw_parms sap_parms solver_parms + +FORCES = force0 force1 force2 force3 force4 force5 \ + frcfcts genfrc tmcg tmcgm xtensor + +LATTICE = bcnds uidx ftidx geometry + +LINALG = salg salg_dble valg valg_dble liealg cmatrix_dble cmatrix + +LINSOLV = cgne mscg fgcr fgcr4vd + +LITTLE = Aw_gen Aw_com Aw_ops Aw_dble Aw ltl_modes + +MDFLDS = mdflds fcom + +RANDOM = ranlux ranlxs ranlxd gauss + +RATFCTS = elliptic zolotarev ratfcts + +SAP = sap_com sap_gcr sap blk_solv + +SFLDS = sflds scom sdcom Pbnd Pbnd_dble + +SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +SW_TERM = pauli pauli_dble swflds sw_term + +TCHARGE = ftcom ftensor tcharge ym_action + +UFLDS = plaq_sum uflds udcom bstap + +UPDATE = chrono mdsteps counters mdint hmc rwtm rwtmeo rwrat + +UTILS = endian mutils utils wspace + +VFLDS = vflds vinit vcom vdcom + +BM = cg_iter cg_iter_dble time_msap time_Awhat + +MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ + $(LATTICE) $(LINALG) $(LINSOLV) $(LITTLE) $(MDFLDS) $(RANDOM) \ + $(RATFCTS) $(SAP) $(SFLDS) $(SU3FCTS) $(SW_TERM) $(TCHARGE) \ + $(UFLDS) $(UPDATE) $(UTILS) $(VFLDS) $(WFLOW) $(BM) + + +# openQCD distribution + +OQCD = ../openQCD-1.4-bgopt + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = $(OQCD)/modules + +MDIR_BM = ../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/mdflds:$(MDIR)/su3fcts:\ + $(MDIR)/utils:$(MDIR)/forces:$(MDIR)/sflds:$(MDIR)/dirac:\ + $(MDIR)/sw_term:$(MDIR)/tcharge:$(MDIR)/block:$(MDIR)/sap:\ + $(MDIR)/linsolv:$(MDIR)/dfl:$(MDIR)/vflds:$(MDIR)/little:\ + $(MDIR)/update:$(MDIR)/wflow:$(MDIR)/ratfcts:$(MDIR_BM)/bm + + +# additional include directories + +INCPATH = $(MPIR_HOME)/include $(OQCD)/include/ ../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPIR_HOME)/lib + + +############################## do not change ################################### + +PGMS= $(MAIN) $(MODULES) + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(LD) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + +kernel: $(addsuffix .o,$(MODULES)) Makefile time3.o + $(AR) $(ARFLAGS) ../../kernel_C.a $(addsuffix .o,$(MODULES)) time3.o + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + $(RM) $(MAIN); \ + echo "delete old executables" + + +# clean directory + +clean: + $(RM) -r *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.amd b/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.amd new file mode 100644 index 0000000000000000000000000000000000000000..6feb9cb6443997f7ac8e2bd888d18784bee02700 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.amd @@ -0,0 +1,154 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + +# main programs and modules to be compiled + +MAIN = time1 time2 time3 + +RANDOM = ranlxs ranlxd gauss random_su3 + +START = geometry start sinit utils flipbc + +FLAGS = flags parms wspace + +MISC = mutils endian sse_fcts su3_fcts su3_prods \ + cmatrix_dble cmatrix + +LINALG = linalg linalg_dble valg valg_dble + +LINSOLV = fgcr fgcr4vd + +SW_TERM = pauli pauli_dble swinit sw_term blk_sw_term + +DIRAC = Pbnd Qbnd Qhat scom Pbnd_dble Qbnd_dble Qhat_dble sdcom + +UPDATE = ucom shift + +BLOCK = block blk_grid blk_umap blk_swmap blk_smap + +EVA = jacobi_dble + +DFL = vgrid vflds vinit vcom vdcom Zgen Zops Zhat Zhat_dble dfl_subspace \ + dfl dfl_sap_gcr dfl_modes + +SAP_GCR = blk_solv msap sap_gcr + +HMC = chrono liealg hmcflds + +BM = cg_iter cg_iter_dble time_msap time_Zhat + +MODULES = $(RANDOM) $(START) $(FLAGS) $(MISC) \ + $(LINALG) $(LINSOLV) $(SW_TERM) $(DIRAC) $(UPDATE) $(BLOCK) \ + $(EVA) $(DFL) $(SAP_GCR) $(HMC) $(BM) + + +# DD-HMC distribution + +DDHMC = ../../DD-HMC-1.2.2 + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = $(DDHMC)/modules + +MDIR_BM = ../modules + +VPATH = .:$(MDIR)/random:$(MDIR)/start:$(MDIR)/flags:$(MDIR)/misc:\ + $(MDIR)/linalg:$(MDIR)/linsolv:$(MDIR)/sw_term:$(MDIR)/dirac:\ + $(MDIR)/update:$(MDIR)/block:$(MDIR)/eva:$(MDIR)/dfl:$(MDIR)/sap_gcr:\ + $(MDIR)/hmc:$(MDIR_BM)/bm: + + +# additional include directories + +INCPATH = $(MPIR_HOME)/include $(DDHMC)/include/ ../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPIR_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -Wall -Wno-long-long \ + -O3 -m32 -malign-double -msse3 -DSSE3 -DPM \ + -Wstrict-prototypes -fstrict-aliasing -Werror + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPIR_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.bgl b/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.bgl new file mode 100644 index 0000000000000000000000000000000000000000..bbe9d46603f58d7b4fb0ae25224d363a42822231 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.bgl @@ -0,0 +1,159 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + +# main programs and modules to be compiled + +MAIN = time1 time2 time3 + +RANDOM = ranlxs ranlxd gauss random_su3 + +START = geometry start sinit utils flipbc + +FLAGS = flags parms wspace + +MISC = mutils endian sse_fcts su3_fcts su3_prods \ + cmatrix_dble cmatrix + +LINALG = linalg linalg_dble valg valg_dble + +LINSOLV = fgcr fgcr4vd + +SW_TERM = pauli pauli_dble swinit sw_term blk_sw_term + +DIRAC = Pbnd Qbnd Qhat scom Pbnd_dble Qbnd_dble Qhat_dble sdcom + +UPDATE = ucom shift + +BLOCK = block blk_grid blk_umap blk_swmap blk_smap + +EVA = jacobi_dble + +DFL = vgrid vflds vinit vcom vdcom Zgen Zops Zhat Zhat_dble dfl_subspace \ + dfl dfl_sap_gcr dfl_modes + +SAP_GCR = blk_solv msap sap_gcr + +HMC = chrono liealg hmcflds + +BM = cg_iter cg_iter_dble time_msap time_Zhat + +MODULES = $(RANDOM) $(START) $(FLAGS) $(MISC) \ + $(LINALG) $(LINSOLV) $(SW_TERM) $(DIRAC) $(UPDATE) $(BLOCK) \ + $(EVA) $(DFL) $(SAP_GCR) $(HMC) $(BM) + + +# DD-HMC distribution + +DDHMC = ../../DD-HMC-1.2.2 + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = $(DDHMC)/modules + +MDIR_BM = ../modules + +VPATH = .:$(MDIR)/random:$(MDIR)/start:$(MDIR)/flags:$(MDIR)/misc:\ + $(MDIR)/linalg:$(MDIR)/linsolv:$(MDIR)/sw_term:$(MDIR)/dirac:\ + $(MDIR)/update:$(MDIR)/block:$(MDIR)/eva:$(MDIR)/dfl:$(MDIR)/sap_gcr:\ + $(MDIR)/hmc:$(MDIR_BM)/bm: + + +# additional include directories + +BGLSYS = /bgl/BlueLight/ppcfloor/bglsys +INCPATH = $(BGLSYS)/include $(DDHMC)/include/ ../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(BGLSYS)/lib + + +# scheduling and optimization options + +CFLAGS = -O3 -qstrict -qarch=440 -qtune=440 -DDH -DSF -DTWBC + +# modules with routines that use Double Hummer intrinsics +DH = linalg_dble linalg pauli_dble pauli Qhat_dble Qhat sdcom scom msap sinit + +# add -qarch=440d to CFLAGS only for modules with Double Hummer intrinsics +$(addsuffix .o,$(DH)): CFLAGS += -qarch=440d + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(BGLSYS)/bin/mpixlc +GCC=/usr/bin/gcc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.intel b/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.intel new file mode 100644 index 0000000000000000000000000000000000000000..fdf531ffaf436b23668e071364488f8527cb8b36 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.intel @@ -0,0 +1,154 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + +# main programs and modules to be compiled + +MAIN = time1 time2 time3 + +RANDOM = ranlxs ranlxd gauss random_su3 + +START = geometry start sinit utils flipbc + +FLAGS = flags parms wspace + +MISC = mutils endian sse_fcts su3_fcts su3_prods \ + cmatrix_dble cmatrix + +LINALG = linalg linalg_dble valg valg_dble + +LINSOLV = fgcr fgcr4vd + +SW_TERM = pauli pauli_dble swinit sw_term blk_sw_term + +DIRAC = Pbnd Qbnd Qhat scom Pbnd_dble Qbnd_dble Qhat_dble sdcom + +UPDATE = ucom shift + +BLOCK = block blk_grid blk_umap blk_swmap blk_smap + +EVA = jacobi_dble + +DFL = vgrid vflds vinit vcom vdcom Zgen Zops Zhat Zhat_dble dfl_subspace \ + dfl dfl_sap_gcr dfl_modes + +SAP_GCR = blk_solv msap sap_gcr + +HMC = chrono liealg hmcflds + +BM = cg_iter cg_iter_dble time_msap time_Zhat + +MODULES = $(RANDOM) $(START) $(FLAGS) $(MISC) \ + $(LINALG) $(LINSOLV) $(SW_TERM) $(DIRAC) $(UPDATE) $(BLOCK) \ + $(EVA) $(DFL) $(SAP_GCR) $(HMC) $(BM) + + +# DD-HMC distribution + +DDHMC = ../../DD-HMC-1.2.2 + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = $(DDHMC)/modules + +MDIR_BM = ../modules + +VPATH = .:$(MDIR)/random:$(MDIR)/start:$(MDIR)/flags:$(MDIR)/misc:\ + $(MDIR)/linalg:$(MDIR)/linsolv:$(MDIR)/sw_term:$(MDIR)/dirac:\ + $(MDIR)/update:$(MDIR)/block:$(MDIR)/eva:$(MDIR)/dfl:$(MDIR)/sap_gcr:\ + $(MDIR)/hmc:$(MDIR_BM)/bm: + + +# additional include directories + +INCPATH = $(MPIR_HOME)/include $(DDHMC)/include/ ../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPIR_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -Wall -Wno-long-long \ + -O3 -m32 -malign-double -msse3 -DSSE3 -DP4 \ + -Wstrict-prototypes -fstrict-aliasing -Werror + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPIR_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.oqcd b/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.oqcd new file mode 100644 index 0000000000000000000000000000000000000000..b2612ac023b149c7ec8f6f725ee508ebc0dc8e22 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.oqcd @@ -0,0 +1,159 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +CC=gcc +LD=mpicc +## include ../Makefile.defs + +all: rmxeq mkdep mkxeq +.PHONY: all + +# main programs and modules to be compiled + +MAIN = time1 time2 time3 time3test + +ARCHIVE = archive sarchive + +BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + +DFL = dfl_geometry dfl_subspace ltl_gcr dfl_sap_gcr dfl_modes + +DIRAC = Dw_dble Dw Dw_bnd + +FLAGS = flags action_parms dfl_parms force_parms hmc_parms lat_parms \ + mdint_parms rat_parms rw_parms sap_parms solver_parms + +FORCES = force0 force1 force2 force3 force4 force5 \ + frcfcts genfrc tmcg tmcgm xtensor + +LATTICE = bcnds uidx ftidx geometry + +LINALG = salg salg_dble valg valg_dble liealg cmatrix_dble cmatrix + +LINSOLV = cgne mscg fgcr fgcr4vd + +LITTLE = Aw_gen Aw_com Aw_ops Aw_dble Aw ltl_modes + +MDFLDS = mdflds fcom + +RANDOM = ranlux ranlxs ranlxd gauss + +RATFCTS = elliptic zolotarev ratfcts + +SAP = sap_com sap_gcr sap blk_solv + +SFLDS = sflds scom sdcom Pbnd Pbnd_dble + +SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +SW_TERM = pauli pauli_dble swflds sw_term + +TCHARGE = ftcom ftensor tcharge ym_action + +UFLDS = plaq_sum uflds udcom bstap + +UPDATE = chrono mdsteps counters mdint hmc rwtm rwtmeo rwrat + +UTILS = endian mutils utils wspace + +VFLDS = vflds vinit vcom vdcom + +BM = cg_iter cg_iter_dble time_msap time_Awhat + +##cg_iter_dble time_msap time_Zhat + +MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ + $(LATTICE) $(LINALG) $(LINSOLV) $(LITTLE) $(MDFLDS) $(RANDOM) \ + $(RATFCTS) $(SAP) $(SFLDS) $(SU3FCTS) $(SW_TERM) $(TCHARGE) \ + $(UFLDS) $(UPDATE) $(UTILS) $(VFLDS) $(WFLOW) $(BM) + + +# DD-HMC distribution + +OQCD = ../openQCD-1.4-bgopt + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = $(OQCD)/modules + +MDIR_BM = ../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/mdflds:$(MDIR)/su3fcts:\ + $(MDIR)/utils:$(MDIR)/forces:$(MDIR)/sflds:$(MDIR)/dirac:\ + $(MDIR)/sw_term:$(MDIR)/tcharge:$(MDIR)/block:$(MDIR)/sap:\ + $(MDIR)/linsolv:$(MDIR)/dfl:$(MDIR)/vflds:$(MDIR)/little:\ + $(MDIR)/update:$(MDIR)/wflow:$(MDIR)/ratfcts:$(MDIR_BM)/bm + + +# additional include directories + +INCPATH = /usr/lib/openmpi/include $(OQCD)/include/ ../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(OMPIR_HOME)/lib + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall \ + -O -Dx64 -DPM -Wno-long-long + +## -Wno-long-long -Wstrict-prototypes -Werror +############################## do not change ################################### + +PGMS= $(MAIN) $(MODULES) + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(LD) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + +kernel: $(addsuffix .o,$(MODULES)) Makefile time3.o + $(AR) $(ARFLAGS) ../../kernel_C.a $(addsuffix .o,$(MODULES)) time3.o + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + $(RM) $(MAIN); \ + echo "delete old executables" + + +# clean directory + +clean: + $(RM) -r *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/time1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time1.c new file mode 100644 index 0000000000000000000000000000000000000000..ca5d262c41db17ad0d189aae9150d7efa4357bb5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time1.c @@ -0,0 +1,113 @@ + +/******************************************************************************* +* +* File time1.c +* +* Copyright (C) 2008 Martin Luescher, Bjorn Leder , 2016 Jacob Finkenrath +* +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* QCD single-precision speed test +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "dirac.h" +#include "global.h" +#include "bm.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "archive.h" +#include "forces.h" +#include "update.h" +#include "version.h" +#include "sw_term.h" + +int main(int argc,char *argv[]) +{ + int my_rank; + double cg_wdt[3],wdt; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time1.log","w",stdout); + error_root(flog==NULL,1,"main [time1.c]","Unable to open log file"); + + printf("\n"); + printf("QCD single-precision speed test\n"); + printf("-------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + if (NPROC>1) + printf("There are %d MPI processes\n",NPROC); + else + printf("There is 1 MPI process\n"); + +#if (defined SSE3) + printf("Using inline assembly SSE3 instructions\n"); +#elif (defined SSE2) + printf("Using inline assembly SSE2 instructions\n"); +#elif (defined SSE) + printf("Using inline assembly SSE instructions\n"); +#endif + +#if (defined SSE) +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + + printf("\n"); + } + + time_cg_iter(flog,cg_wdt); + + wdt=2.0*cg_wdt[0]+3.0*cg_wdt[1]+2.0*cg_wdt[2]; + + if (my_rank==0) + { + printf("########################################################\n"); + printf("# #\n"); + printf("# SYNTHETIC QCD SPEED TEST #\n"); + printf("# #\n"); + printf("# Using single-precision (%d bit) data and programs #\n", + 8*(int)(sizeof(float))); + printf("# #\n"); + printf("# Time per lattice point: %8.3f micro sec #\n", + wdt); + printf("# Average speed: %8.3f Gflops/process #\n", + 1.0e-3*4200.0/wdt); + printf("# Total throughput: %8.3f Gflops #\n", + 1.0e-3*(double)(NPROC)*4200.0/wdt); + printf("# #\n"); + printf("########################################################\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/time1.log b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time1.log new file mode 100644 index 0000000000000000000000000000000000000000..d7dac4390bc866e0853492d7e92302ee47f8c246 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time1.log @@ -0,0 +1,41 @@ + +QCD single-precision speed test +------------------------------- + +16x8x8x8 lattice, 2x1x1x1 process grid, 8x8x8x8 local lattice + +There are 2 MPI processes + + +Single-precision data and programs +------------------------------- + +The local size of the gauge field is 1152 KB +The local size of a quark field is 384 KB + +Lattice parameters: +beta = 5.5 +c0 = 1.0, c1 = 0.0 +csw = 1.978 + +Program norm_square: +Time per lattice point: 0.018 micro sec (2723 Mflops/process) + +Program mulc_spinor_add: +Time per lattice point: 0.052 micro sec (1855 Mflops/process) + +Program Dhat: +Time per lattice point: 0.183 micro sec (10402 Mflops/process) + +######################################################## +# # +# SYNTHETIC QCD SPEED TEST # +# # +# Using single-precision (32 bit) data and programs # +# # +# Time per lattice point: 0.557 micro sec # +# Average speed: 7.494 Gflops/process # +# Total throughput: 14.988 Gflops # +# # +######################################################## + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/time2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time2.c new file mode 100644 index 0000000000000000000000000000000000000000..f5953921851f71f379176230ecfb40e6057b3f1f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time2.c @@ -0,0 +1,113 @@ + +/******************************************************************************* +* +* File time2.c +* +* Copyright (C) 2008 Martin Luescher, Bjorn Leder, 2016 Jacob Finkenrath +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* QCD double-precision speed test +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "global.h" +#include "bm.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "archive.h" +#include "forces.h" +#include "update.h" +#include "version.h" +#include "sw_term.h" +#include "dirac.h" + +int main(int argc,char *argv[]) +{ + int my_rank; + double cgd_wdt[3],wdt; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time2.log","w",stdout); + error_root(flog==NULL,1,"main [time2.c]","Unable to open log file"); + + printf("\n"); + printf("QCD double-precision speed test\n"); + printf("-------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + if (NPROC>1) + printf("There are %d MPI processes\n",NPROC); + else + printf("There is 1 MPI process\n"); + +#if (defined SSE3) + printf("Using inline assembly SSE3 instructions\n"); +#elif (defined SSE2) + printf("Using inline assembly SSE2 instructions\n"); +#elif (defined SSE) + printf("Using inline assembly SSE instructions\n"); +#endif + +#if (defined SSE) +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + + printf("\n"); + } + + time_cg_iter_dble(flog,cgd_wdt); + + wdt=2.0*cgd_wdt[0]+3.0*cgd_wdt[1]+2.0*cgd_wdt[2]; + + if (my_rank==0) + { + printf("########################################################\n"); + printf("# #\n"); + printf("# SYNTHETIC QCD SPEED TEST #\n"); + printf("# #\n"); + printf("# Using double-precision (%d bit) data and programs #\n", + 8*(int)(sizeof(double))); + printf("# #\n"); + printf("# Time per lattice point: %8.3f micro sec #\n", + wdt); + printf("# Average speed: %8.3f Gflops/process #\n", + 1.0e-3*4200.0/wdt); + printf("# Total throughput: %8.3f Gflops #\n", + 1.0e-3*(double)(NPROC)*4200.0/wdt); + printf("# #\n"); + printf("########################################################\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/time2.log b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time2.log new file mode 100644 index 0000000000000000000000000000000000000000..01221333ca2471b19b8b9671ef38a3ab554b6e7f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time2.log @@ -0,0 +1,41 @@ + +QCD double-precision speed test +------------------------------- + +16x8x8x8 lattice, 2x1x1x1 process grid, 8x8x8x8 local lattice + +There are 2 MPI processes + + +Double-precision data and programs +------------------------------- + +The local size of the gauge field is 2304 KB +The local size of a quark field is 768 KB + +Lattice parameters: +beta = 5.5 +c0 = 1.0, c1 = 0.0 +csw = 1.978 + +Program norm_square_dble: +Time per lattice point: 0.035 micro sec (1358 Mflops/process) + +Program mulc_spinor_add_dble: +Time per lattice point: 0.110 micro sec (871 Mflops/process) + +Program Dhat_dble: +Time per lattice point: 0.377 micro sec (5063 Mflops/process) + +######################################################## +# # +# SYNTHETIC QCD SPEED TEST # +# # +# Using double-precision (64 bit) data and programs # +# # +# Time per lattice point: 1.155 micro sec # +# Average speed: 3.616 Gflops/process # +# Total throughput: 7.232 Gflops # +# # +######################################################## + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3.c new file mode 100644 index 0000000000000000000000000000000000000000..46ee7fd394866b36f0fa46650f51e37edeef0a44 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3.c @@ -0,0 +1,190 @@ + +/******************************************************************************* +* +* File time3.c +* +* Copyright (C) 2008 Bjorn Leder, 2016 Jacob Finkenrath +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* QCD speed test +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "global.h" +#include "bm.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "archive.h" +#include "forces.h" +#include "update.h" +#include "version.h" +#include "sw_term.h" +#include "dirac.h" + +#if (((L0%4)!=0)||((L1%4)!=0)||((L2%4)!=0)||((L3%4)!=0)) +#error: The local lattice sizes must be a multiple of 4 +#endif + + +int kernel_c() +{ + int my_rank,nb, kernel_number=2; + double cg_wdt[3],cgd_wdt[3],msap_wdt,ahat_wdt,wdt; + FILE *flog=NULL; + + /* JuBE */ + /* call jube initial function */ + jube_kernel_init(&kernel_number); + +/* MPI_Init(&argc,&argv); */ + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { +/* flog=freopen("time3.log","w",stdout); */ +/* error_root(flog==NULL,1,"main [time3.c]","Unable to open log file"); */ + + flog = stdout; + + printf("\n"); + printf("QCD speed test\n"); + printf("-------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + if (NPROC>1) + printf("There are %d MPI processes\n",NPROC); + else + printf("There is 1 MPI process\n"); + +#if (defined SSE3) + printf("Using inline assembly SSE3 instructions\n"); +#elif (defined SSE2) + printf("Using inline assembly SSE2 instructions\n"); +#elif (defined SSE) + printf("Using inline assembly SSE instructions\n"); +#endif + +#if (defined SSE) +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + + printf("\n"); + } + + /* JuBE */ + /* call jube run function */ + jube_kernel_run(); + + time_cg_iter(flog,cg_wdt); + time_cg_iter_dble(flog,cgd_wdt); + time_msap(flog,&msap_wdt); + time_Awhat(flog,&ahat_wdt,&nb); + + /* JuBE */ + /* call jube finalize function */ + jube_kernel_finalize(); + + wdt=2.0*cg_wdt[0]+3.0*cg_wdt[1]+2.0*cg_wdt[2]; + + if (my_rank==0) + { + printf("\n"); + printf("#########################################################\n"); + printf("# #\n"); + printf("# SYNTHETIC QCD SPEED TEST #\n"); + printf("# #\n"); + printf("# Using single-precision (%d bit) data and programs #\n", + 8*(int)(sizeof(float))); + printf("# #\n"); + printf("# Time per lattice point: %8.3f micro sec #\n", + wdt); + printf("# Average speed: %8.3f Gflops/process #\n", + 1.0e-3*4200.0/wdt); + printf("# Total throughput: %8.3f Gflops #\n", + 1.0e-3*(double)(NPROC)*4200.0/wdt); + printf("# #\n"); + } + + wdt=2.0*cgd_wdt[0]+3.0*cgd_wdt[1]+2.0*cgd_wdt[2]; + + if (my_rank==0) + { + printf("# #\n"); + printf("# Using double-precision (%d bit) data and programs #\n", + 8*(int)(sizeof(double))); + printf("# #\n"); + printf("# Time per lattice point: %8.3f micro sec #\n", + wdt); + printf("# Average speed: %8.3f Gflops/process #\n", + 1.0e-3*4200.0/wdt); + printf("# Total throughput: %8.3f Gflops #\n", + 1.0e-3*(double)(NPROC)*4200.0/wdt); + printf("# #\n"); + } + + if (my_rank==0) + { + printf("# #\n"); + printf("# Using the Schwarz preconditioner [%d bit arithmetic] #\n", + 8*(int)(sizeof(float))); + printf("# #\n"); + printf("# Time per lattice point: %8.3f micro sec #\n", + msap_wdt); + printf("# Average speed: %8.3f Gflops/process #\n", + 1.0e-3*((double)((NMR+1)*2076+48)+112.0*2.0)/msap_wdt); + printf("# Total throughput: %8.3f Gflops #\n", + 1.0e-3*((double)((NMR+1)*2076+48)+112.0*2.0)/msap_wdt); + printf("# #\n"); + } + + if (my_rank==0) + { + printf("# #\n"); + printf("# Using deflation (little Dirac operator) #\n"); + printf("# [%d bit arithmetic] #\n", + (int)(4*sizeof(complex))); + printf("# #\n"); + printf("# Time per lattice point: %8.3f micro sec #\n", + ahat_wdt/(double)(VOLUME)); + printf("# Average speed: %8.3f Gflops/process #\n", + 1.0e-3*64.0*(double)(nb*NS*NS)/ahat_wdt); + printf("# Total throughput: %8.3f Gflops #\n", + 1.0e-3*64.0*(double)(NPROC*nb*NS*NS)/ahat_wdt); + printf("# #\n"); + printf("#########################################################\n\n"); + +/* fclose(flog); */ + } + +/* MPI_Finalize(); */ + + /* JuBE */ + /* call jube end function */ + jube_kernel_end(); + + return 0; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3_bgl_256.log b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3_bgl_256.log new file mode 100644 index 0000000000000000000000000000000000000000..842f56bbb55c3b5cabdb44b422310219d6673e07 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3_bgl_256.log @@ -0,0 +1,109 @@ + +QCD speed test +------------------------------- + +32x32x32x32 lattice, 4x4x4x4 process grid, 8x8x8x8 local lattice + +There are 256 MPI processes + + +Single-precision data and programs +------------------------------- + +The local size of the gauge field is 1152 KB +The local size of a quark field is 384 KB + +Program norm_square: +Time per lattice point: 0.093 micro sec (513 Mflops/process) + +Program mulc_spinor_add: +Time per lattice point: 0.220 micro sec (436 Mflops/process) + +Program Qhat: +Time per lattice point: 4.422 micro sec (428 Mflops/process) + + +Double-precision data and programs +------------------------------- + +The local size of the gauge field is 2304 KB +The local size of a quark field is 768 KB + +Program norm_square_dble: +Time per lattice point: 0.202 micro sec (237 Mflops/process) + +Program mulc_spinor_add_dble: +Time per lattice point: 0.447 micro sec (214 Mflops/process) + +Program Qhat_dble: +Time per lattice point: 7.542 micro sec (251 Mflops/process) + + +Timing of the Schwarz preconditioner +------------------------------------ + +bs = 4 4 4 4 +nmr = 4 +ncy = 5 + +The number of blocks per process is 16 +The local size of the gauge field is 1152 KB +The local size of a quark field is 384 KB +The size of the block gauge field is 72 KB +The size of a block quark field is 24 KB + +Time per lattice point: 67.19 micro sec (about 759 Mflops [32 bit arithmetic]) +Time per lattice point and MR iteration: 3.36 micro sec + + +Timing of Zhat +-------------- + +bs = 4 4 4 4 +Ns = 16 + +Number of points = 4096 +Number of blocks = 16 +Number of points/block = 256 +Vector field size = 2.05 KB +Zhat array size = 0.26 MB + +Time per application of Zhat, including communications: +Time per block: 163.805 micro sec (100 Mflops [32 bit arithmetic]) +Time per point: 0.640 micro sec + + +######################################################### +# # +# SYNTHETIC QCD SPEED TEST # +# # +# Using single-precision (32 bit) data and programs # +# # +# Time per lattice point: 9.691 micro sec # +# Average speed: 0.431 Gflops/process # +# Total throughput: 110.314 Gflops # +# # +# # +# Using double-precision (64 bit) data and programs # +# # +# Time per lattice point: 16.830 micro sec # +# Average speed: 0.248 Gflops/process # +# Total throughput: 63.521 Gflops # +# # +# # +# Using the Schwarz preconditioner [32 bit arithmetic] # +# # +# Time per lattice point: 67.192 micro sec # +# Average speed: 0.759 Gflops/process # +# Total throughput: 194.399 Gflops # +# # +# # +# Using deflation (little Dirac operator) # +# [32 bit arithmetic] # +# # +# Time per lattice point: 0.640 micro sec # +# Average speed: 0.100 Gflops/process # +# Total throughput: 25.605 Gflops # +# # +######################################################### + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3_bgl_single.log b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3_bgl_single.log new file mode 100644 index 0000000000000000000000000000000000000000..29c7a7907ffaa8dd396556df3f6920d97520626a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3_bgl_single.log @@ -0,0 +1,109 @@ + +QCD speed test +------------------------------- + +8x8x8x8 lattice, 1x1x1x1 process grid, 8x8x8x8 local lattice + +There is 1 MPI process + + +Single-precision data and programs +------------------------------- + +The local size of the gauge field is 1152 KB +The local size of a quark field is 384 KB + +Program norm_square: +Time per lattice point: 0.089 micro sec (539 Mflops/process) + +Program mulc_spinor_add: +Time per lattice point: 0.143 micro sec (669 Mflops/process) + +Program Qhat: +Time per lattice point: 1.899 micro sec (998 Mflops/process) + + +Double-precision data and programs +------------------------------- + +The local size of the gauge field is 2304 KB +The local size of a quark field is 768 KB + +Program norm_square_dble: +Time per lattice point: 0.193 micro sec (248 Mflops/process) + +Program mulc_spinor_add_dble: +Time per lattice point: 0.283 micro sec (339 Mflops/process) + +Program Qhat_dble: +Time per lattice point: 2.586 micro sec (733 Mflops/process) + + +Timing of the Schwarz preconditioner +------------------------------------ + +bs = 4 4 4 4 +nmr = 4 +ncy = 5 + +The number of blocks is 16 +The local size of the gauge field is 1152 KB +The local size of a quark field is 384 KB +The size of the block gauge field is 72 KB +The size of a block quark field is 24 KB + +Time per lattice point: 56.74 micro sec (about 899 Mflops [32 bit arithmetic]) +Time per lattice point and MR iteration: 2.84 micro sec + + +Timing of Zhat +-------------- + +bs = 4 4 4 4 +Ns = 16 + +Number of points = 4096 +Number of blocks = 16 +Number of points/block = 256 +Vector field size = 2.05 KB +Zhat array size = 0.26 MB + +Time per application of Zhat, including communications: +Time per block: 135.231 micro sec (121 Mflops [32 bit arithmetic]) +Time per point: 0.528 micro sec + + +######################################################### +# # +# SYNTHETIC QCD SPEED TEST # +# # +# Using single-precision (32 bit) data and programs # +# # +# Time per lattice point: 4.406 micro sec # +# Average speed: 0.948 Gflops/process # +# Total throughput: 0.948 Gflops # +# # +# # +# Using double-precision (64 bit) data and programs # +# # +# Time per lattice point: 6.406 micro sec # +# Average speed: 0.652 Gflops/process # +# Total throughput: 0.652 Gflops # +# # +# # +# Using the Schwarz preconditioner [32 bit arithmetic] # +# # +# Time per lattice point: 56.740 micro sec # +# Average speed: 0.899 Gflops/process # +# Total throughput: 0.899 Gflops # +# # +# # +# Using deflation (little Dirac operator) # +# [32 bit arithmetic] # +# # +# Time per lattice point: 0.528 micro sec # +# Average speed: 0.121 Gflops/process # +# Total throughput: 0.121 Gflops # +# # +######################################################### + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3_intel_single.log b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3_intel_single.log new file mode 100644 index 0000000000000000000000000000000000000000..3ae12fc1db50a1977e7dee93e8df8c07f53ecd1c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3_intel_single.log @@ -0,0 +1,111 @@ + +QCD speed test +------------------------------- + +8x8x8x8 lattice, 1x1x1x1 process grid, 8x8x8x8 local lattice + +There is 1 MPI process +Using inline assembly SSE3 instructions +Assuming SSE prefetch instructions fetch 128 bytes + + +Single-precision data and programs +------------------------------- + +The local size of the gauge field is 1152 KB +The local size of a quark field is 384 KB + +Program norm_square: +Time per lattice point: 0.030 micro sec (1587 Mflops/process) + +Program mulc_spinor_add: +Time per lattice point: 0.088 micro sec (1088 Mflops/process) + +Program Qhat: +Time per lattice point: 0.515 micro sec (3683 Mflops/process) + + +Double-precision data and programs +------------------------------- + +The local size of the gauge field is 2304 KB +The local size of a quark field is 768 KB + +Program norm_square_dble: +Time per lattice point: 0.052 micro sec (919 Mflops/process) + +Program mulc_spinor_add_dble: +Time per lattice point: 0.164 micro sec (584 Mflops/process) + +Program Qhat_dble: +Time per lattice point: 0.819 micro sec (2315 Mflops/process) + + +Timing of the Schwarz preconditioner +------------------------------------ + +bs = 4 4 4 4 +nmr = 4 +ncy = 5 + +The number of blocks is 16 +The local size of the gauge field is 1152 KB +The local size of a quark field is 384 KB +The size of the block gauge field is 72 KB +The size of a block quark field is 24 KB + +Time per lattice point: 15.41 micro sec (about 3311 Mflops [32 bit arithmetic]) +Time per lattice point and MR iteration: 0.77 micro sec + + +Timing of Zhat +-------------- + +bs = 4 4 4 4 +Ns = 16 + +Number of points = 4096 +Number of blocks = 16 +Number of points/block = 256 +Vector field size = 2.05 KB +Zhat array size = 0.26 MB + +Time per application of Zhat, including communications: +Time per block: 3.845 micro sec (4260 Mflops [32 bit arithmetic]) +Time per point: 0.015 micro sec + + +######################################################### +# # +# SYNTHETIC QCD SPEED TEST # +# # +# Using single-precision (32 bit) data and programs # +# # +# Time per lattice point: 1.354 micro sec # +# Average speed: 3.083 Gflops/process # +# Total throughput: 3.083 Gflops # +# # +# # +# Using double-precision (64 bit) data and programs # +# # +# Time per lattice point: 2.235 micro sec # +# Average speed: 1.869 Gflops/process # +# Total throughput: 1.869 Gflops # +# # +# # +# Using the Schwarz preconditioner [32 bit arithmetic] # +# # +# Time per lattice point: 15.407 micro sec # +# Average speed: 3.312 Gflops/process # +# Total throughput: 3.312 Gflops # +# # +# # +# Using deflation (little Dirac operator) # +# [32 bit arithmetic] # +# # +# Time per lattice point: 0.015 micro sec # +# Average speed: 4.261 Gflops/process # +# Total throughput: 4.261 Gflops # +# # +######################################################### + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3test.c b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3test.c new file mode 100644 index 0000000000000000000000000000000000000000..e076c175824d1c4a763d5b3119bc5fb8d51df1a6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3test.c @@ -0,0 +1,195 @@ + +/******************************************************************************* +* +* File time3test.c +* +* Copyright (C) 2008 Bjorn Leder, 2016 Jacob Finkenrath +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* QCD speed test +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "global.h" +#include "bm.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "archive.h" +#include "forces.h" +#include "update.h" +#include "version.h" +#include "sw_term.h" +#include "dirac.h" + +#if (((L0%4)!=0)||((L1%4)!=0)||((L2%4)!=0)||((L3%4)!=0)) +#error: The local lattice sizes must be a multiple of 4 +#endif + + + +int main(int argc,char *argv[]) +{ + int my_rank,nb; + /*int kernel_number=2;*/ + double cg_wdt[3],cgd_wdt[3],msap_wdt,ahat_wdt,wdt; + FILE *flog=NULL; + + /* JuBE */ + /* call jube initial function */ + /*jube_kernel_init(&kernel_number);*/ + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time3test.log","w",stdout); + error_root(flog==NULL,1,"main [time3test.c]","Unable to open log file"); + + flog = stdout; + + printf("\n"); + printf("QCD speed test\n"); + printf("-------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + if (NPROC>1) + printf("There are %d MPI processes\n",NPROC); + else + printf("There is 1 MPI process\n"); + +#if (defined SSE3) + printf("Using inline assembly SSE3 instructions\n"); +#elif (defined SSE2) + printf("Using inline assembly SSE2 instructions\n"); +#elif (defined SSE) + printf("Using inline assembly SSE instructions\n"); +#endif + +#if (defined SSE) +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + + printf("\n"); + } + + /* JuBE */ + /* call jube run function + jube_kernel_run();*/ + + time_cg_iter(flog,cg_wdt); + time_cg_iter_dble(flog,cgd_wdt); + time_msap(flog,&msap_wdt); + time_Awhat(flog,&ahat_wdt,&nb); + + /* JuBE */ + /* call jube finalize function + jube_kernel_finalize();*/ + + wdt=2.0*cg_wdt[0]+3.0*cg_wdt[1]+2.0*cg_wdt[2]; + + if (my_rank==0) + { + printf("\n"); + printf("#########################################################\n"); + printf("# #\n"); + printf("# SYNTHETIC QCD SPEED TEST #\n"); + printf("# #\n"); + printf("# Using single-precision (%d bit) data and programs #\n", + 8*(int)(sizeof(float))); + printf("# #\n"); + printf("# Time per lattice point: %8.3f micro sec #\n", + wdt); + printf("# Average speed: %8.3f Gflops/process #\n", + 1.0e-3*4200.0/wdt); + printf("# Total throughput: %8.3f Gflops #\n", + 1.0e-3*(double)(NPROC)*4200.0/wdt); + printf("# #\n"); + } + + wdt=2.0*cgd_wdt[0]+3.0*cgd_wdt[1]+2.0*cgd_wdt[2]; + + if (my_rank==0) + { + printf("# #\n"); + printf("# Using double-precision (%d bit) data and programs #\n", + 8*(int)(sizeof(double))); + printf("# #\n"); + printf("# Time per lattice point: %8.3f micro sec #\n", + wdt); + printf("# Average speed: %8.3f Gflops/process #\n", + 1.0e-3*4200.0/wdt); + printf("# Total throughput: %8.3f Gflops #\n", + 1.0e-3*(double)(NPROC)*4200.0/wdt); + printf("# #\n"); + } + + + if (my_rank==0) + { + printf("# #\n"); + printf("# Using the Schwarz preconditioner [%d bit arithmetic] #\n", + 8*(int)(sizeof(float))); + printf("# #\n"); + printf("# Time per lattice point: %8.3f micro sec #\n", + (double)(NCY)*(msap_wdt)); + printf("# Average speed: %8.3f Gflops/process #\n", + 1.0e-3*((double)((NMR+1)*2076+48)+112.0*2.0)/msap_wdt); + + printf("# Total throughput: %8.3f Gflops #\n", + 1.0e-3*NPROC*((double)((NMR+1)*2076+48)+112.0*2.0)/msap_wdt); + + printf("# #\n"); + } + + if (my_rank==0) + { + printf("# #\n"); + printf("# Using deflation (little Dirac operator) #\n"); + printf("# [%d bit arithmetic] #\n", + (int)(4*sizeof(complex))); + printf("# #\n"); + printf("# Time per lattice point: %8.3f micro sec #\n", + ahat_wdt/(double)(VOLUME)); + printf("# Average speed: %8.3f Gflops/process #\n", + 1.0e-3*64.0*(double)(nb*NS*NS)/ahat_wdt); + printf("# Total throughput: %8.3f Gflops #\n", + 1.0e-3*64.0*(double)(NPROC*nb*NS*NS)/ahat_wdt); + printf("# #\n"); + printf("#########################################################\n\n"); + +/* fclose(flog); */ + } + + MPI_Finalize(); + + /* JuBE */ + /* call jube end function + jube_kernel_end();*/ + + return 0; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3test.log b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3test.log new file mode 100644 index 0000000000000000000000000000000000000000..889d4340aa4b502ef7b85adce660fe28fed4bc60 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3test.log @@ -0,0 +1,144 @@ + +QCD speed test +------------------------------- + +16x8x8x8 lattice, 2x1x1x1 process grid, 8x8x8x8 local lattice + +There are 2 MPI processes + + +Single-precision data and programs +------------------------------- + +The local size of the gauge field is 1152 KB +The local size of a quark field is 384 KB + +Lattice parameters: +beta = 5.5 +c0 = 1.0, c1 = 0.0 +csw = 1.978 + +Program norm_square: +Time per lattice point: 0.019 micro sec (2471 Mflops/process) + +Program mulc_spinor_add: +Time per lattice point: 0.056 micro sec (1700 Mflops/process) + +Program Dhat: +Time per lattice point: 0.206 micro sec (9283 Mflops/process) + + +Double-precision data and programs +------------------------------- + +The local size of the gauge field is 2304 KB +The local size of a quark field is 768 KB + +Lattice parameters: +beta = 5.5 +c0 = 1.0, c1 = 0.0 +csw = 1.978 + +Program norm_square_dble: +Time per lattice point: 0.037 micro sec (1288 Mflops/process) + +Program mulc_spinor_add_dble: +Time per lattice point: 0.111 micro sec (867 Mflops/process) + +Program Dhat_dble: +Time per lattice point: 0.393 micro sec (4850 Mflops/process) + + +Timing of the SAP preconditioner +-------------------------------- + +16x8x8x8 lattice, 2x1x1x1 process grid, 8x8x8x8 local lattice + +Using SSE3 instructions and 16 xmm registers +Assuming SSE prefetch instructions fetch 64 bytes + +bs = 4 4 4 4 +ncy = 5 +nmr = 4 + +Lattice parameters: +beta = 5.5 +c0 = 1.0, c1 = 0.0 +csw = 1.978 + +Periodic boundary conditions + +Using the MinRes block solver: +Time per lattice point: 4.053 micro sec +Time per point and cycle: 0.811 micro sec (about 11439 Mflops) + +rbb is 2.000000 +Using the even-odd preconditioned MinRes block solver: +Time per lattice point: 4.566 micro sec +Time per point and cycle: 0.913 micro sec (about 11664 Mflops) + + +Timing of Awhat() +----------------- + +16x8x8x8 lattice, 2x1x1x1 process grid, 8x8x8x8 local lattice + +Lattice parameters: +beta = 5.5 +c0 = 1.0, c1 = 0.0 +csw = 1.978 + +Periodic boundary conditions + +Number of points = 4096 +Number of blocks = 16 +Number of points/block = 256 +Vector field size = 2.05 KB +Awhat array size = 0.26 MB + +Time per application of Awhat(), including communications: +Total: 0.020 msec +Per block: 1.274 usec (12855 Mflops [32 bit arithmetic]) +Per point: 0.005 usec + +There are 16 boundary blocks +Time per application of Awhat() for the communications: +Total: 0.004 msec +Per block: 0.231 usec +Per point: 0.001 usec + + +######################################################### +# # +# SYNTHETIC QCD SPEED TEST # +# # +# Using single-precision (32 bit) data and programs # +# # +# Time per lattice point: 0.619 micro sec # +# Average speed: 6.783 Gflops/process # +# Total throughput: 13.566 Gflops # +# # +# # +# Using double-precision (64 bit) data and programs # +# # +# Time per lattice point: 1.193 micro sec # +# Average speed: 3.520 Gflops/process # +# Total throughput: 7.040 Gflops # +# # +# # +# Using the Schwarz preconditioner [32 bit arithmetic] # +# # +# Time per lattice point: 4.566 micro sec # +# Average speed: 11.664 Gflops/process # +# Total throughput: 23.328 Gflops # +# # +# # +# Using deflation (little Dirac operator) # +# [32 bit arithmetic] # +# # +# Time per lattice point: 0.005 micro sec # +# Average speed: 12.856 Gflops/process # +# Total throughput: 25.711 Gflops # +# # +######################################################### + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/modules/bm/cg_iter.c b/qcd/part_cpu/applications/QCD/src/kernel_C/modules/bm/cg_iter.c new file mode 100644 index 0000000000000000000000000000000000000000..056be8f54a2539a1abfc7bdae54f9453821cb811 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/modules/bm/cg_iter.c @@ -0,0 +1,276 @@ +/******************************************************************************* +* +* File cg_iter.c +* +* Copyright (C) 2008 Bjorn Leder 2016 Jacob Finkenrath +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Based on QCDpbm-1.1 (http://luscher.web.cern.ch/luscher/QCDpbm/index.html) +* +*******************************************************************************/ + +#define CG_ITER_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "su3fcts.h" +#include "random.h" +#include "lattice.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "sflds.h" +#include "flags.h" +#include "uflds.h" +#include "utils.h" +#include "global.h" + +spinor **ps; + +static double wt_norm_square(int nflds) +{ + int my_rank,nmax,n,i,ib; + /*float r;*/ + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)(nmax*nflds); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_mulc_spinor_add(int nflds) +{ + int my_rank,nmax,n,i,ib; + complex z; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + z.re=0.123f; + z.im=0.456f; + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)((nmax*nflds)/2); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_Qhat(int nflds) +{ + int my_rank,nmax,n,i,ib; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)((nmax*nflds)/2); + } + + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +void time_cg_iter(FILE *flog, double *wdt) +{ + int my_rank,nflds,n; + double phi[2]; + double wdt0,wdt1,wdt2; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + printf("\n"); + printf("Single-precision data and programs\n"); + printf("-------------------------------\n\n"); + + if ((VOLUME*sizeof(float))<(64*1024)) + { + printf("The local size of the gauge field is %d KB\n", + (int)((72*VOLUME*sizeof(float))/(1024))); + printf("The local size of a quark field is %d KB\n", + (int)((24*VOLUME*sizeof(float))/(1024))); + } + else + { + printf("The local size of the gauge field is %d MB\n", + (int)((72*VOLUME*sizeof(float))/(1024*1024))); + printf("The local size of a quark field is %d MB\n", + (int)((24*VOLUME*sizeof(float))/(1024*1024))); + } + + printf("\n"); + } + + start_ranlux(0,12); + phi[0]=0.0; + phi[1]=0.0; + set_bc_parms(3,1.0,1.0,1.0,1.0,phi,phi); + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + set_sw_parms(-0.0123); + geometry(); + /*alloc_u(); + alloc_ud(); + alloc_sw(); + alloc_swd();*/ + + random_ud(); + chs_ubnd(-1); + assign_ud2u(); + + sw_term(ODD_PTS); + /*error(invert_swd(ODD_PTS)!=0,1,"main [time1.c]", + "Inversion of swd on the odd sites was not safe");*/ + assign_swd2sw(); + + nflds=(int)((4*1024*1024)/(VOLUME*sizeof(float)))+1; + if ((nflds%2)==1) + nflds+=1; + alloc_ws(nflds); + + ps=reserve_ws(nflds); + for (n=0;n +#include +#include +#include "mpi.h" +#include "su3.h" +#include "su3fcts.h" +#include "random.h" +#include "lattice.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "sflds.h" +#include "flags.h" +#include "uflds.h" +#include "utils.h" +#include "global.h" + +spinor_dble **psd; + +static double wt_norm_square(int nflds) +{ + int my_rank,nmax,n,i,ib; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)(nmax*nflds); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_mulc_spinor_add(int nflds) +{ + int my_rank,nmax,n,i,ib; + complex_dble z; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + z.re=0.123; + z.im=0.456; + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)((nmax*nflds)/2); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_Qhat(int nflds) +{ + int my_rank,nmax,n,i,ib; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)((nmax*nflds)/2); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +void time_cg_iter_dble(FILE *flog, double *wdt) +{ + int my_rank,nflds,n; + double phi[2]; + double wdt0,wdt1,wdt2; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + printf("\n"); + printf("Double-precision data and programs\n"); + printf("-------------------------------\n\n"); + + if ((VOLUME*sizeof(double))<(64*1024)) + { + printf("The local size of the gauge field is %d KB\n", + (int)((72*VOLUME*sizeof(double))/(1024))); + printf("The local size of a quark field is %d KB\n", + (int)((24*VOLUME*sizeof(double))/(1024))); + } + else + { + printf("The local size of the gauge field is %d MB\n", + (int)((72*VOLUME*sizeof(double))/(1024*1024))); + printf("The local size of a quark field is %d MB\n", + (int)((24*VOLUME*sizeof(double))/(1024*1024))); + } + + printf("\n"); + } + + start_ranlux(0,12); + phi[0]=0.0; + phi[1]=0.0; + set_bc_parms(3,1.0,1.0,1.0,1.0,phi,phi); + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + set_sw_parms(-0.0123); + + geometry(); + /*alloc_ud(); + alloc_swd();*/ + + random_ud(); + chs_ubnd(-1); + sw_term(ODD_PTS); + /*error(invert_swd(ODD_PTS)!=0,1,"main [time2.c]", + "Inversion of swd on the odd sites was not safe");*/ + + nflds=(int)((4*1024*1024)/(VOLUME*sizeof(double)))+1; + if ((nflds%2)==1) + nflds+=1; + alloc_wsd(nflds); + + psd=reserve_wsd(nflds); + for (n=0;n +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "vflds.h" +#include "linalg.h" +#include "dirac.h" +#include "dfl.h" +#include "little.h" +#include "global.h" +#include "bm.h" + +static int bs[4]={4,4,4,4}; + +static void random_basis(int Ns) +{ + int i; + spinor **ws; + + ws=reserve_ws(Ns); + + for (i=0;i1) + { + nt/=2; + if (nt==0) + nt=1; + wt=0.0; + + while (wt<5.0) + { + for (i=0;i +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "sap.h" +#include "global.h" +#include "bm.h" + +static int bs[4]={4,4,4,4}; + +void time_msap(FILE *flog, double *wdt) +{ + int my_rank,bc,count,nt; + int ncy,nmr; + int n,ie; + float mu; + double phi[2],phi_prime[2]; + double rbb,wt1,wt2; + spinor **ps; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + if (my_rank==0) + { + + printf("\n"); + printf("Timing of the SAP preconditioner\n"); + printf("--------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + +#if (defined x64) +#if (defined AVX) + printf("Using AVX instructions\n"); +#else + printf("Using SSE3 instructions and 16 xmm registers\n"); +#endif +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + printf("\n"); + + printf("bs = %d %d %d %d\n",bs[0],bs[1],bs[2],bs[3]); + printf("ncy = %d\n",NCY); + printf("nmr = %d\n\n",NMR); + + + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + ncy=NCY; + nmr=NMR; + bc=3; + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12); + geometry(); + alloc_ws(3); + set_sap_parms(bs,0,1,1); + alloc_bgr(SAP_BLOCKS); + + set_sw_parms(0.0123); + mu=0.0785f; + rbb=2.0*(1.0/(double)(bs[0])+1.0/(double)(bs[1])+ + 1.0/(double)(bs[2])+1.0/(double)(bs[3])); + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + assign_ud2ubgr(SAP_BLOCKS); + assign_swd2swbgr(SAP_BLOCKS,NO_PTS); + + ps=reserve_ws(3); + random_s(VOLUME,ps[2],1.0f); + bnd_s2zero(ALL_PTS,ps[2]); + normalize(VOLUME,1,ps[2]); + + nt=(int)(2.0e6/(double)(ncy*nmr*VOLUME)); + if (nt<2) + nt=2; + (*wdt)=0.0; + + while ((*wdt)<5.0) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + for (count=0;count that allows the type of boundary condition to + be specified at run time. + +- Corrected a bug in Dwee_dble() [modules/dirac/Dw_dbl.c] that shows up in + some check programs if none of the local lattice sizes L1,L2,L3 is divisible + by 4. The functionality of the other modules and the main programs in ./main + was not affected by this bug, because Dwee_dble() is not called in any of + these programs. + +- Corrected modules/flags/rw_parms.c so as to allow for Hasenbusch factorized + reweighting factors. + +- Corrected and improved the descriptions at the top of many module files. + +- Corrected devel/ratfcts/INDEX. + +- Added forgotten "plots" directory in devel/nompi/main. + +- Replaced &irat in MPI_Bcast(&irat,3,MPI_INT,0,MPI_COMM_WORLD) by irat in + flags/force_parms.c [read_forc_parms() and read_force_parms2()]. This is not + a mistake but an unnatural and unintended use of the C language. Corrected + analogous cases in a number of check programs (thanks to Hubert Simma and + Georg Engel for noting these misprints). + +- Corrected check program block/check1.c (the point labeling does not need to + respect any time ordering). + + +12. May 2013 + +Version 1.2: 2nd public release. + +- Added AVX inline-assembly to the time-critical functions (Dirac operator, + linear algebra, SAP preconditioner, SU(3) functions). See the README file in + the top directory of the distribution. + +- Added support for blocked MPI process ranking, as is likely to be profitable + on parallel computers with mult-core nodes (see main/README.global). + +- Made the field import/export functions more efficient by avoiding the + previously excessive use of MPI_Barrier(). + +- Added import/export functions for the state of the random number generators. + Modified the initialization of the generators so as to be independent of the + ranking of the MPI processes. See the notes in modules/random/ranlux.c. Added + a check program in devel/random. + +- Continuation runs of qcd1,qcd2,ym1 and ms1 now normally reset the random + number generators to their state at the end of the previous run. The + programs initialize the generators in the traditional way if the option + -norng is set (see README.qcd1, for example). + +- Modified the deflated SAP+GCR solver (dfl/dfl_sap_gcr.c) by replacing the + deflation projectors through an inaccurate projection in the preconditioner + (as suggested by Frommer et al. [arXiv:1303:1377]; the deflation subspace + type and subspace generation algorithm are unchanged). This leads to a + structural simplification and, after some parameter tuning, to a slight + performance gain. NOTE: the deflation parameter set is changed too and the + number of status variables is reduced by 1 (see modules/flags/dfl_parms.c, + modules/dfl/dfl_sap_gcr.c and doc/parms.pdf). + +- Included a program (devel/dfl/check4.c) that allows the parameters of the + deflated SAP+GCR solver to be tuned on a given lattice. + +- Deleted the now superfluous module/dfl/dfl_projectors.c. + +- Added the function fdigits() [utils/mutils.c] that allows double-precision + floating point numbers to be printed with all significant decimal digits + (and only these). The main programs make use of this function to ensure that + the values of the decimal parameters are printed to the log files with as + many significant digits as were given on the input parameter file (assuming + not more digits were specified than can be represented by a double number). + +- Replaced "if" by "else if" on line 379 of main/ms2.c. This bug stopped the + program with an error message when the CGNE solver was used. It had no + effect when other solvers were used. + +- Changed the type of the variable "sf" to "int" in lines 257 and 440 of + forces/force0.c. This bug had no effect in view of the automatic type + conversions performed by the compiler. + +- Corrected sign in line 174 of devel/sap/check2.c. This bug led to wrong + check results, thus incorrectly suggesting that the SAP modules were + incorrect. + +- Corrected a mistake in devel/tcharge/check2.c and devel/tcharge/check5.c + that gave rise to wrong results suggesting that the tested modules were + incorrect. + + +14. June 2012 + +Version 1.0: Initial public release. + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/COPYING b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/COPYING new file mode 100644 index 0000000000000000000000000000000000000000..7a8e8abfd0057f374fbf59076c263f1f5d685b73 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/COPYING @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..5da2eb95bd0d02d0123feba6c759b3522e3d9fdd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/INDEX @@ -0,0 +1,19 @@ + +At the top level, the openQCD package is structured as follows: + +devel Directories used for developing and testing the various + modules + +doc Documentation files + +include All include files. Typically there is one include file + per directory in the modules directory + +main Collection of main programs + +modules Source code of all modules + +In addition to the information provided by the notes in the doc directory, +short descriptions of the program functionalities are included in the source +directories and at the top of each program file. Further information is found +in various README files (such as main/README.global). diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/README new file mode 100644 index 0000000000000000000000000000000000000000..74cbe5541e6d8ed24f8f52ec785c8699e722b9d3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/README @@ -0,0 +1,244 @@ + +******************************************************************************** + + openQCD Simulation Program + +******************************************************************************** + + +LATTICE THEORY + +Currently the common features of the supported lattice theories are the +following: + +* 4-dimensional hypercubic N0xN1xN2xN3 lattice with even sizes N0,N1,N2,N3. + Open, Schrödinger functional (SF), open-SF or periodic boundary conditions + in the time direction and periodic boundary conditions in the space + directions. + +* SU(3) gauge group, plaquette plus planar double-plaquette gauge action + (Wilson, Symanzik, Iwasaki,...). + +* O(a)-improved Wilson quarks in the fundamental representation of the gauge + group. Among the supported quark multiplets are the classical ones (pure + gauge, two-flavour theory, 2+1 and 2+1+1 flavour QCD), but doublets with a + twisted mass and theories with many doublets, for example, are also + supported. + +The O(a)-improvement includes the boundary counterterms required for the +improvement of the correlation functions near the boundaries of the lattice in +the time direction if open, SF or open-SF boundary conditions are chosen. + + +SIMULATION ALGORITHM + +The simulation program is based on the HMC algorithm. For the heavier quarks, +a version of the RHMC algorithm is used. Several advanced techniques are +implemented that can be configured at run time: + +* Nested hierarchical integrators for the molecular-dynamics equations, based + on any combination of the leapfrog, 2nd order Omelyan-Mryglod-Folk (OMF) and + 4th order OMF elementary integrators, are supported. + +* Twisted-mass Hasenbusch frequency splitting, with any number of factors + and twisted masses. Optionally with even-odd preconditioning. + +* Twisted-mass determinant reweighting. + +* Deflation acceleration and chronological solver along the molecular-dynamics + trajectories. + +* A choice of solvers (CGNE, MSCG, SAP+GCR, deflated SAP+GCR) for the Dirac + equation, separately configurable for each force component and + pseudo-fermion action. + +All of these depend on a number of parameters, whose values are passed to the +simulation program together with those of the action parameters (coupling +constants, quark masses, etc.) through a structured input parameter file. + + +PROGRAM FEATURES + +All programs parallelize in 0,1,2,3 or 4 dimensions, depending on what is +specified at compilation time. They are highly optimized for machines with +current Intel or AMD processors, but will run correctly on any system that +complies with the ISO C89 (formerly ANSI C) and the MPI 1.2 standards. + +For the purpose of testing and code development, the programs can also +be run on a desktop or laptop computer. All what is needed for this is +a compliant C compiler and a local MPI installation such as Open MPI. + + +DOCUMENTATION + +The simulation program has a modular form, with strict prototyping and a +minimal use of external variables. Each program file contains a small number +of externally accessible functions whose functionality is described at the top +of the file. + +The data layout is explained in various README files and detailed instructions +are given on how to run the main programs. A set of further documentation +files are included in the doc directory, where the normalization conventions, +the chosen algorithms and other important program elements are described. + + +COMPILATION + +The compilation of the programs requires an ISO C89 compliant compiler and a +compatible MPI installation that complies with the MPI standard 1.2 (or later). + +In the main and devel directories, a GNU-style Makefile is included which +compiles and links the programs (just type "make" to compile everything; "make +clean" removes the files generated by "make"). The compiler options can be set +by editing the CFLAGS line in the Makefiles. + +The Makefiles assume that the following environment variables are set: + + GCC GNU C compiler command [Example: /usr/bin/gcc]. + + MPI_HOME MPI home directory [Example: /usr/lib64/mpi/gcc/openmpi]. + The mpicc command used is the one in $MPI_HOME/mpicc and + the MPI libraries are expected in $MPI_HOME/lib. + + MPI_INCLUDE Directory where the mpi.h file is to be found. + +All programs are then compiled using the $MPI_HOME/bin/mpicc command. The +compiler options that can be set in the CFLAGS line depend on which C compiler +the mpicc command invokes (the GCC compiler command is only used to resolve +the dependencies on the include files). + + +SSE/AVX ACCELERATION + +Current Intel and AMD processors are able to perform arithmetic operations on +short vectors of floating-point numbers in just one or two machine cycles, +using SSE and/or AVX instructions. The arithmetic performed by these +instructions fully complies with the IEEE 754 standard. + +Many programs in the module directories include SSE and AVX inline-assembly +code. On 64bit systems, and if the GNU or Intel C compiler is used, the code +can be activated by setting the compiler flags -Dx64 and -DAVX, respectively. +In addition, SSE prefetch instructions will be used if one of the following +options is specified: + + -DP4 Assume that prefetch instructions fetch 128 bytes at a time + (Pentium 4 and related Xeons). + + -DPM Assume that prefetch instructions fetch 64 bytes at a time + (Athlon, Opteron, Pentium M, Core, Core 2 and related Xeons). + + -DP3 Assume that prefetch instructions fetch 32 bytes at a time + (Pentium III). + +These options have an effect only if -Dx64 or -DAVX is set. The option +-DAVX implies -Dx64. + +On recent x86-64 machines with AMD Opteron or Intel Xeon processors, for +example, the recommended compiler flags are + + -std=c89 -O -mno-avx -DAVX -DPM + +For older machines that do not support the AVX instruction set, the +recommended flags are + + -std=c89 -O -mno-avx -Dx64 -DPM + +More aggressive optimization levels such as -O2 and -O3 tend to have little +effect on the execution speed of the programs, but the risk of generating +wrong code is higher. + +AVX instructions and the option -mno-avx may not be known to old versions +of the compilers, in which case one is limited to SSE accelerations with +option string -std=c89 -O -Dx64 -DPM. + + +DEBUGGING FLAGS + +For troubleshooting and parameter tuning, it may helpful to switch on some +debugging flags at compilation time. The simulation program then prints a +detailed report to the log file on the progress made in specified subprogram. + +The available flags are: + +-DCGNE_DBG CGNE solver. + +-DFGCR_DBG GCR solver. + +-FGCR4VD_DBG GCR solver for the little Dirac equation. + +-DMSCG_DBG MSCG solver. + +-DDFL_MODES_DBG Deflation subspace generation. + +-DMDINT_DBG Integration of the molecular-dynamics equations. + +-DRWRAT_DBG Computation of the rational function reweighting + factor. + + +RUNNING A SIMULATION + +The simulation programs reside in the directory "main". For each program, +there is a README file in this directory which describes the program +functionality and its parameters. + +Running a simulation for the first time requires its parameters to be chosen, +which tends to be a non-trivial task. The syntax of the input parameter files +and the meaning of the various parameters is described in some detail in +main/README.infiles and doc/parms.pdf. Examples of valid parameter files are +contained in the directory main/examples. + + +EXPORTED FIELD FORMAT + +The field configurations generated in the course of a simulation are written +to disk in a machine-independent format (see modules/misc/archive.c). +Independently of the machine endianness, the fields are written in little +endian format. A byte-reordering is therefore not required when machines with +different endianness are used for the simulation and the physics analysis. + + +AUTHORS + +The initial release of the openQCD package was written by Martin Lüscher and +Stefan Schaefer. Support for Schrödinger functional boundary conditions was +added by John Bulava. Several modules were taken over from the DD-HMC program +tree, which includes contributions from Luigi Del Debbio, Leonardo Giusti, +Björn Leder and Filippo Palombi. + + +ACKNOWLEDGEMENTS + +In the course of the development of the openQCD code, many people suggested +corrections and improvements or tested preliminary versions of the programs. +The authors are particularly grateful to Isabel Campos, Dalibor Djukanovic, +Georg Engel, Leonardo Giusti, Björn Leder, Carlos Pena and Hubert Simma for +their communications and help. + + +LICENSE + +The software may be used under the terms of the GNU General Public Licence +(GPL). + + +BUG REPORTS + +If a bug is discovered, please send a report to . + + +ALTERNATIVE PACKAGES AND COMPLEMENTARY PROGRAMS + +There is a publicly available BG/Q version of openQCD that takes advantage of +the machine-specific features of IBM BlueGene/Q computers. The version is +available at . + +The openQCD programs currently do not support reweighting in the quark +masses, but a module providing this functionality can be downloaded from +. + +Previously generated gauge-field configurations are often used as initial +configuration for a new run. If the old and new lattices or boundary +conditions are not the same, the old configuration may however need to be +adapted, using a field conversion tool such as the one available at +, before the new run is started. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/README new file mode 100644 index 0000000000000000000000000000000000000000..4a03581d0fb9138e14869b976c4e5927bf1ce7d5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/README @@ -0,0 +1,11 @@ + +This directory contains the check programs for the various modules. +They should better be executed, for the chosen lattice parameters, +before a simulation is started. + +The check programs for some of the program files that do not use the +MPI library (such as the random number generators) are included in the +directory "nompi". All other directories correspond to the module +directories with the same name, and the check programs in these are +MPI main programs. + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..7d59c07d48402babefb5a874acb37354e53586e0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/INDEX @@ -0,0 +1,19 @@ + +Saving/restoring field configurations to/from files. + +check1 Writing and reading of gauge configurations. + +check2 Exporting and importing gauge configurations. + +check3 Importing a previously exported configuration. + +check4 Writing and reading spinor fields. + +check5 Exporting and importing spinor fields. + +check6 Importing a previously exported spinor field. + +The programs check1,check2,check3 accept the option -bc that allows the +type of boundary condition to be chosen (open boundary conditions are assumed +if the option is not set). All other programs are insensitive to the boundary +conditions. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..0ecd1721ea0bdb18270d1ecb8054222cabaa38a4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/Makefile @@ -0,0 +1,134 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 check4 check5 check6 + +FLAGS = flags lat_parms hmc_parms dfl_parms + +LATTICE = bcnds uidx geometry + +ARCHIVE = archive sarchive + +LINALG = liealg cmatrix_dble salg_dble + +RANDOM = ranlux ranlxs ranlxd gauss + +UFLDS = plaq_sum uflds udcom + +SFLDS = sflds + +SU3FCTS = chexp cm3x3 random_su3 su3prod su3ren + +UTILS = endian mutils utils wspace + +MODULES = $(FLAGS) $(LATTICE) $(ARCHIVE) $(LINALG) $(RANDOM) $(UFLDS) \ + $(SFLDS) $(SU3FCTS) $(UTILS) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/sflds:\ + $(MDIR)/su3fcts:$(MDIR)/utils + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog \ + *.log~ *.dat *.dat~ $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..50f57fd51f3a377ec01a48ff731e78873302545e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check1.c @@ -0,0 +1,220 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2005, 2007, 2010, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Writing and reading gauge configurations. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "su3fcts.h" +#include "linalg.h" +#include "archive.h" +#include "global.h" + +static int *rlxs_state[2],*rlxd_state[2]; + + +static void save_ranlux(void) +{ + int nlxs,nlxd; + int *p; + + nlxs=rlxs_size(); + nlxd=rlxd_size(); + + p=malloc(2*(nlxs+nlxd)*sizeof(*p)); + error(p==NULL,1,"save_ranlux [check1.c]", + "Unable to allocate state arrays"); + rlxs_state[0]=p; + p+=nlxs; + rlxs_state[1]=p; + p+=nlxs; + rlxd_state[0]=p; + p+=nlxd; + rlxd_state[1]=p; + + rlxs_get(rlxs_state[0]); + rlxd_get(rlxd_state[0]); +} + + +static int check_ranlux(void) +{ + int nlxs,nlxd,k,ie; + + nlxs=rlxs_size(); + nlxd=rlxd_size(); + + rlxs_get(rlxs_state[1]); + rlxd_get(rlxd_state[1]); + ie=0; + + for (k=0;k]"); + } + + MPI_Bcast(loc_dir,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + alloc_wud(1); + + check_dir(loc_dir); + nsize=name_size("%s/testcnfg_%d",loc_dir,NPROC); + error_root(nsize>=NAME_SIZE,1,"main [check1.c]","loc_dir name is too long"); + sprintf(cnfg,"%s/testcnfg_%d",loc_dir,my_rank); + + if (my_rank==0) + { + printf("Write random field configuration to the files\n" + "%s/testcnfg_*\n" + "on the local disks.\n\n",loc_dir); + printf("Then read the field from there, compare with the saved field\n" + "and remove all files.\n\n"); + } + + usv=reserve_wud(1); + udb=udfld(); + + random_ud(); + cm3x3_assign(4*VOLUME,udb,usv[0]); + save_ranlux(); + write_cnfg(cnfg); + + random_ud(); + read_cnfg(cnfg); + remove(cnfg); + error_chk(); + + ie=(check_bc(0.0)^0x1); + ie|=check_ud(usv[0]); + error(ie!=0,1,"main [check1.c]","The gauge field is not properly restored"); + + ie=check_ranlux(); + error(ie!=0,1,"main [check1.c]", + "The random number generator is not properly restored"); + print_flags(); + + if (my_rank==0) + { + printf("No errors detected --- the fields are correctly written\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check1.in new file mode 100644 index 0000000000000000000000000000000000000000..7a91706372cf5afb3ac77e539f402ba3c1cd4428 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check1.in @@ -0,0 +1 @@ +loc_dir /home/data/openQCD/cnfg diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..e159214fe76fbc10f891581bb5a7351c5e7d3b6a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check2.c @@ -0,0 +1,166 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2005, 2007, 2010, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Exporting and importing gauge configurations. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "su3fcts.h" +#include "linalg.h" +#include "archive.h" +#include "global.h" + + +static int cmp_ud(su3_dble *u,su3_dble *v) +{ + int it; + + it =((*u).c11.re!=(*v).c11.re); + it|=((*u).c11.im!=(*v).c11.im); + it|=((*u).c12.re!=(*v).c12.re); + it|=((*u).c12.im!=(*v).c12.im); + it|=((*u).c13.re!=(*v).c13.re); + it|=((*u).c13.im!=(*v).c13.im); + + it|=((*u).c21.re!=(*v).c21.re); + it|=((*u).c21.im!=(*v).c21.im); + it|=((*u).c22.re!=(*v).c22.re); + it|=((*u).c22.im!=(*v).c22.im); + it|=((*u).c23.re!=(*v).c23.re); + it|=((*u).c23.im!=(*v).c23.im); + + it|=((*u).c31.re!=(*v).c31.re); + it|=((*u).c31.im!=(*v).c31.im); + it|=((*u).c32.re!=(*v).c32.re); + it|=((*u).c32.im!=(*v).c32.im); + it|=((*u).c33.re!=(*v).c33.re); + it|=((*u).c33.im!=(*v).c33.im); + + return it; +} + + +static int check_ud(su3_dble *usv) +{ + int it; + su3_dble *u,*um; + + u=udfld(); + um=u+4*VOLUME; + it=0; + + for (;u]"); + } + + MPI_Bcast(cnfg_dir,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + alloc_wud(1); + + check_dir_root(cnfg_dir); + nsize=name_size("%s/testcnfg",cnfg_dir); + error_root(nsize>=NAME_SIZE,1,"main [check2.c]","cnfg_dir name is too long"); + + if (my_rank==0) + { + printf("Export random field configurations to the file\n" + "%s/testcnfg.\n",cnfg_dir); + printf("Then read the fields from there and compare with the saved " + "fields.\n\n"); + } + + udb=udfld(); + usv=reserve_wud(1); + random_ud(); + cm3x3_assign(4*VOLUME,udb,usv[0]); + + sprintf(cnfg,"%s/testcnfg",cnfg_dir); + export_cnfg(cnfg); + + random_ud(); + import_cnfg(cnfg); + error_chk(); + + ie=(check_bc(0.0)^0x1); + ie|=check_ud(usv[0]); + error(ie!=0,1,"main [check2.c]","The gauge field is not properly restored"); + print_flags(); + + if (my_rank==0) + { + printf("No errors detected --- the fields are correctly exported\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check2.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check2.in new file mode 100644 index 0000000000000000000000000000000000000000..87f9119fb146db92fab20e244c40fcac21ee1274 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check2.in @@ -0,0 +1 @@ +cnfg_dir /home/data/openQCD/cnfg diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..741c225b346f248a2350655363d16dc3bb631d61 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check3.c @@ -0,0 +1,141 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2005, 2007, 2008, 2010, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Importing a configuration previously exported by check2. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "su3fcts.h" +#include "linalg.h" +#include "archive.h" +#include "global.h" + + +static double avg_plaq(void) +{ + double plaq; + + plaq=plaq_sum_dble(1); + + return plaq/((double)(6*NPROC)*(double)(VOLUME)); +} + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,nsize,ir,ie; + stdint_t l[4]; + double phi[2],phi_prime[2]; + double plaq0,plaq1,plaq2; + char cnfg_dir[NAME_SIZE],cnfg[NAME_SIZE]; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check3.log","w",stdout); + fin=freopen("check3.in","r",stdin); + + printf("\n"); + printf("Importing gauge fields exported by check2\n"); + printf("-----------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("cnfg_dir","%s\n",cnfg_dir); + fclose(fin); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check3.c]", + "Syntax: check3 [-bc ]"); + } + + MPI_Bcast(cnfg_dir,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,9876); + geometry(); + random_ud(); + plaq0=avg_plaq(); + + check_dir_root(cnfg_dir); + nsize=name_size("%s/testcnfg",cnfg_dir); + error_root(nsize>=NAME_SIZE,1,"main [check3.c]","cnfg_dir name is too long"); + sprintf(cnfg,"%s/testcnfg",cnfg_dir); + + if (my_rank==0) + { + fin=fopen(cnfg,"rb"); + error_root(fin==NULL,1,"main [check3.c]","Unable to open input file"); + + ir=fread(l,sizeof(stdint_t),4,fin); + ir+=fread(&plaq1,sizeof(double),1,fin); + error_root(ir!=5,1,"main [check3.c]","Incorrect read count"); + fclose(fin); + + if (endianness()==BIG_ENDIAN) + { + bswap_int(4,l); + bswap_double(1,&plaq1); + } + + printf("Random gauge field, average plaquette = %.15e\n\n",plaq0); + printf("Now read gauge field from file\n" + "%s:\n",cnfg); + printf("%dx%dx%dx%d lattice\n", + (int)(l[0]),(int)(l[1]),(int)(l[2]),(int)(l[3])); + printf("Average plaquette = %.15e\n",plaq1); + } + + import_cnfg(cnfg); + ie=check_bc(0.0); + plaq2=avg_plaq(); + error_chk(); + error(ie!=1,1,"main [check3.c]","Boundary conditions are not preserved"); + + if (my_rank==0) + { + printf("Should be = %.15e\n\n",plaq2); + remove(cnfg); + } + + print_flags(); + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check3.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check3.in new file mode 100644 index 0000000000000000000000000000000000000000..ddffa2380f2cafe172782e07bb0f9c6a24134554 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check3.in @@ -0,0 +1 @@ +cnfg_dir /home/data/openQCD/cnfg diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check4.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check4.c new file mode 100644 index 0000000000000000000000000000000000000000..cc7049ec847bfc31903eac0ce27e4ed0217ea066 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check4.c @@ -0,0 +1,112 @@ + +/******************************************************************************* +* +* File check4.c +* +* Copyright (C) 2007, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Writing and reading spinor fields. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "linalg.h" +#include "archive.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,nsize,k; + double d,dmax; + spinor_dble **psd; + char loc_dir[NAME_SIZE],name[NAME_SIZE]; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check4.log","w",stdout); + fin=freopen("check4.in","r",stdin); + + printf("\n"); + printf("Writing and reading spinor fields\n"); + printf("---------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("loc_dir","%s\n",loc_dir); + fclose(fin); + } + + MPI_Bcast(loc_dir,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + + start_ranlux(0,123456); + geometry(); + alloc_wsd(6); + psd=reserve_wsd(6); + + check_dir(loc_dir); + nsize=name_size("%s/testsfld_%d.%d",loc_dir,NPROC,6); + error_root(nsize>=NAME_SIZE,1,"main [check4.c]","loc_dir name is too long"); + + for (k=0;k<3;k++) + { + random_sd(VOLUME,psd[k],1.0); + sprintf(name,"%s/testsfld_%d.%d",loc_dir,my_rank,k); + write_sfld(name,psd[k]); + } + + for (k=0;k<3;k++) + { + sprintf(name,"%s/testsfld_%d.%d",loc_dir,my_rank,k); + read_sfld(name,psd[k+3]); + remove(name); + } + + dmax=0.0; + + for (k=0;k<3;k++) + { + mulr_spinor_add_dble(VOLUME,psd[k],psd[k+3],-1.0); + d=norm_square_dble(VOLUME,1,psd[k]); + + if (d>dmax) + dmax=d; + } + + error_chk(); + + if (my_rank==0) + { + printf("Wrote 3 spinor fields to the files\n" + "%s/testsfld_*\n" + "on the local disks. ",loc_dir); + printf("Then read the fields from there and removed\n" + "the files.\n\n"); + printf("Maximal deviation = %.1e ",sqrt(dmax)); + printf("(should be exactly equal to 0.0)\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check4.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check4.in new file mode 100644 index 0000000000000000000000000000000000000000..e4eb9a0f89f1afa33f7426c77c5479291f941b23 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check4.in @@ -0,0 +1 @@ +loc_dir /home/data/openQCD/scnfg diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check5.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check5.c new file mode 100644 index 0000000000000000000000000000000000000000..9d916277319e22d85b5faa0385ed055fd17c85ad --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check5.c @@ -0,0 +1,155 @@ + +/******************************************************************************* +* +* File check5.c +* +* Copyright (C) 2007, 2008, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Exporting and importing spinor fields. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "linalg.h" +#include "archive.h" +#include "global.h" + +static const spinor_dble sd0={{{0.0}}}; +static spinor_dble **psd; + + +static void ptfld(int k) +{ + int x0,x1,x2,x3,y0,y1,y2,y3,ix; + spinor_dble *s; + + y0=L0*cpr[0]; + y1=L1*cpr[1]; + y2=L2*cpr[2]; + y3=L3*cpr[3]; + + for (x0=0;x0=NAME_SIZE,1,"main [check5.c]","sfld_dir name is too long"); + + for (k=0;k<3;k++) + { + random_sd(VOLUME,psd[k],1.0); + sprintf(name,"%s/testsfld%d",sfld_dir,k); + export_sfld(name,psd[k]); + } + + for (k=0;k<3;k++) + { + sprintf(name,"%s/testsfld%d",sfld_dir,k); + import_sfld(name,psd[k+3]); + remove(name); + } + + dmax=0.0; + + for (k=0;k<3;k++) + { + mulr_spinor_add_dble(VOLUME,psd[k],psd[k+3],-1.0); + d=norm_square_dble(VOLUME,0,psd[k]); + + if (d>dmax) + dmax=d; + } + + error_chk(); + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("Exported 3 spinor fields to the directory\n" + "%s\n",sfld_dir); + printf("Then reimported and deleted them\n\n"); + printf("Maximal deviation = %.1e ",sqrt(dmax)); + printf("(should be exactly equal to 0.0)\n\n"); + } + + ptfld(4); + sprintf(name,"%s/testsfld",sfld_dir); + export_sfld(name,psd[4]); + + if (my_rank==0) + { + printf("Point source field exported to file\n" + "%s\n\n",name); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check5.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check5.in new file mode 100644 index 0000000000000000000000000000000000000000..6a8c2bccbb02d4c50c1d1c6f2657af3cea1e87aa --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check5.in @@ -0,0 +1 @@ +sfld_dir /home/data/openQCD/scnfg diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check6.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check6.c new file mode 100644 index 0000000000000000000000000000000000000000..6fa80bc399a34266f232af9b5b7bd8f4346304a7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check6.c @@ -0,0 +1,124 @@ + +/******************************************************************************* +* +* File check6.c +* +* Copyright (C) 2007, 2008, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Importing a previously exported spinor field. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "linalg.h" +#include "archive.h" +#include "global.h" + +static spinor_dble **psd; +static const spinor_dble sd0={{{0.0}}}; + + +static void ptfld(int k) +{ + int x0,x1,x2,x3,y0,y1,y2,y3,ix; + spinor_dble *s; + + y0=L0*cpr[0]; + y1=L1*cpr[1]; + y2=L2*cpr[2]; + y3=L3*cpr[3]; + + for (x0=0;x0=NAME_SIZE,1,"main [check6.c]","sfld_dir name is too long"); + sprintf(name,"%s/testsfld",sfld_dir); + + import_sfld(name,psd[1]); + + mulr_spinor_add_dble(VOLUME,psd[0],psd[1],-1.0); + d=norm_square_dble(VOLUME,1,psd[0]); + + error_chk(); + + if (my_rank==0) + { + printf("Imported field from file\n" + "%s\n\n",name); + printf("Deviation = %.1e ",sqrt(d)); + printf("(should be exactly equal to 0.0)\n\n"); + remove(name); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check6.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check6.in new file mode 100644 index 0000000000000000000000000000000000000000..db4406f006b28dc110bda1b66a39a5aebaf65eed --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check6.in @@ -0,0 +1 @@ +sfld_dir /home/data/openQCD/scnfg \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..ca84d1525e49de3d9f3545c0039b9e1e8b5eafd8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/INDEX @@ -0,0 +1,21 @@ + +Block grid allocation and field assignment programs + +check1 Checks on the local geometry arrays b.ipt,b.iup,b.idn, + b.imb,b.ibp and b.bb.ipp,b.bb.map,b.bb.imb for the + known block grids. This program also checks ipt_blk(). + +check2 Checks on the allocation and initialization of the gauge, + Dirac and Weyl fields on the known block grids. + +check3 Check of assign_ud2ubgr() and assign_ud2udblk(). + +check4 Check of assign_swd2swbgr() and assign_swd2swdblk(). + +check5 Check of assign_s2sblk(),...,assign_sdblk2sd(). + +The programs check1, check3 and check4 accept the option -bc that +allows the type of boundary condition to be chosen at runtime. When the option +is not set, open boundary conditions are assumed. + +The option may be set but has no effect in the case of the other programs. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..4c89cb557ddd1b02d70ed97ed4acbf52bd51b06e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/Makefile @@ -0,0 +1,139 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 check4 check5 + +FLAGS = flags lat_parms sap_parms dfl_parms + +RANDOM = ranlux ranlxs ranlxd gauss + +LATTICE = bcnds ftidx uidx geometry + +BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + +UFLDS = uflds udcom shift + +SFLDS = sflds Pbnd + +LINALG = salg salg_dble liealg cmatrix_dble + +SU3FCTS = su3prod su3ren cm3x3 random_su3 + +UTILS = endian mutils utils wspace + +TCHARGE = ftcom ftensor + +SW_TERM = pauli pauli_dble swflds sw_term + +SAP = sap_com + +MODULES = $(FLAGS) $(RANDOM) $(LATTICE) $(BLOCK) $(UFLDS) $(SFLDS) \ + $(LINALG) $(SU3FCTS) $(UTILS) $(TCHARGE) $(SW_TERM) $(SAP) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/random:$(MDIR)/lattice:$(MDIR)/block:\ + $(MDIR)/uflds:$(MDIR)/sflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ + $(MDIR)/linalg:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/sap + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..c41511d388837da834a544fa25919765fa24e678 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check1.c @@ -0,0 +1,525 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2005, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Consistency checks on the geometry arrays in the known block grids. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "block.h" +#include "global.h" + +static int ix_test[VOLUME+BNDRY]; + + +static void test1(blk_grid_t grid,int *bs) +{ + int ix,iy,itest; + int nb,isw,vol,*imb; + block_t *b,*bm; + + for (ix=0;ix=VOLUME)) + itest=2; + else + { + ix_test[iy]+=1; + if (ix_test[iy]>1) + itest=3; + } + } + } + + for (ix=0;ixvol)) + itest=1; + else + { + if (iy!=imb[ix]) + itest=2; + + is=(x0+x1+x2+x3+bo[0]+bo[1]+bo[2]+bo[3])%2; + + if (((is==0)&&(ix>=(vol/2)))||((is!=0)&&(ix<(vol/2)))) + itest=3; + + for (mu=0;mu<4;mu++) + { + if ((x[mu]+1)0) + { + if (imb[(*b).idn[ix][mu]]!=idn[iy][mu]) + itest=6; + } + else + { + if ((*b).idn[ix][mu]!=vol) + itest=7; + } + } + } + } + } + } + } + } + + error(itest==1,1,"test2 [check1.c]", + "b.ipt is out of range"); + error(itest==2,1,"test2 [check1.c]", + "The blocks are not properly embedded"); + error(itest==3,1,"test2 [check1.c]", + "b.ipt does not respect the even-odd ordering"); + error(itest==4,1,"test2 [check1.c]", + "b.iup is incorrect"); + error(itest==5,1,"test2 [check1.c]", + "b.iup is incorrect at the block boundary"); + error(itest==6,1,"test2 [check1.c]", + "b.idn is incorrect"); + error(itest==7,1,"test2 [check1.c]", + "b.idn is incorrect at the block boundary"); +} + + +static void test3(blk_grid_t grid) +{ + int bc,ix,iy,ie,itest; + int nbp,nall,x[4]; + int nb,isw,vol,*bs,*imb; + block_t *b,*bm; + + bc=bc_type(); + itest=0; + nall=0; + b=blk_list(grid,&nb,&isw); + bm=b+nb; + + for (;b=vol)) + itest=2; + + if (iy>0) + { + if (ix<=(*b).ibp[iy-1]) + itest=3; + } + + ix=imb[ix]; + ie=((global_time(ix)==0)&&(bc!=3)); + ie|=((global_time(ix)==(NPROC0*L0-1))&&(bc==0)); + + if (ie==0) + itest=4; + } + } + + if ((cpr[0]==0)&&(bc!=3)) + nall-=(L1*L2*L3); + if ((cpr[0]==(NPROC0-1))&&(bc==0)) + nall-=(L1*L2*L3); + + error(itest==1,1,"test3 [check1.c]", + "b.nbp is incorrect"); + error(itest==2,1,"test3 [check1.c]", + "b.ibp is out of range"); + error(itest==3,1,"test3 [check1.c]", + "b.ibp is not properly ordered"); + error(itest==4,1,"test3 [check1.c]", + "The points b.ibp are not all on the boundary of the lattice"); + error(nall!=0,1,"test3 [check1.c]", + "Incorrect total count of points at time 0 and NPROC0*L0-1"); +} + + +static void test4(blk_grid_t grid) +{ + int ix,iy,ifc,mu,ib,itest; + int nb,isw,vol,*bs,*imb; + block_t *b,*bm; + bndry_t *bb; + + for (ix=0;ix<(VOLUME+BNDRY);ix++) + ix_test[ix]=0; + + itest=0; + b=blk_list(grid,&nb,&isw); + bm=b+nb; + + for (;b=vol)) + itest=2; + + iy=(*bb).imb[ix]; + + if ((iy<0)||(iy>=(VOLUME+BNDRY))) + itest=3; + else + ix_test[iy]+=1; + } + + bb+=1; + } + } + + b=blk_list(grid,&nb,&isw); + + for (;b=(vol/2))) + itest=1; + + iy=(*bb).imb[ix]; + iz=(*bb).imb[ix+(*bb).vol/2]; + + if ((iy>=(VOLUME+(BNDRY/2)))||((iy>=(VOLUME/2))&&(iy=VOLUME)&&(iz<(VOLUME+(BNDRY/2))))) + itest=2; + } + + mu=ifc/2; + + for (ix=0;ix<(*bb).vol;ix++) + { + iy=(*bb).ipp[ix]; + + if ((((ifc%2)==0)&&((*b).idn[iy][mu]!=vol))|| + (((ifc%2)==1)&&((*b).iup[iy][mu]!=vol))) + itest=3; + + iz=(*bb).map[ix]; + + if ((ifc%2)==0) + { + for (is=1;is]"); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + geometry(); + set_sap_parms(bs,0,1,1); + set_dfl_parms(bs,2); + grid=BLK_GRIDS; + + for (igr=0;igr<(int)(BLK_GRIDS);igr++) + { + if (igr==0) + grid=SAP_BLOCKS; + else if (igr==1) + grid=DFL_BLOCKS; + else + error_root(1,1,"main [check1.c]","Unknown block grid"); + + alloc_bgr(grid); + + test1(grid,bs); + test2(grid); + test3(grid); + test4(grid); + test5(grid); + } + + error_chk(); + + if (my_rank==0) + { + printf("No errors detected\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check1.in new file mode 100644 index 0000000000000000000000000000000000000000..bd654839cac6ab535881018a1109ac0080e8af27 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check1.in @@ -0,0 +1 @@ +bs 4 4 4 4 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..c959c52188120362af34b81d223087772d049193 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check2.c @@ -0,0 +1,588 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2005, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Checks on the allocation and initialization of the gauge, Dirac and Weyl +* fields on the known block grids. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "block.h" +#include "global.h" + +typedef union +{ + su3 u; + float r[18]; +} umat_t; + +typedef union +{ + su3_dble u; + double r[18]; +} umat_dble_t; + +typedef union +{ + spinor s; + float r[24]; +} spin_t; + +typedef union +{ + spinor_dble s; + double r[24]; +} spin_dble_t; + +typedef union +{ + weyl w; + float r[12]; +} wspin_t; + +typedef union +{ + weyl_dble w; + double r[12]; +} wspin_dble_t; + + +static int check_u(int vol,su3 *u) +{ + int i; + umat_t *m,*mm; + + m=(umat_t*)(u); + mm=m+vol; + + for (;m=6)&&((*sw).u[i]!=0.0f)))) + return 1; + } + } + + return 0; +} + + +static int check_swd(int vol,pauli_dble *swd) +{ + int i; + pauli_dble *sm; + + sm=swd+vol; + + for (;swd=6)&&((*swd).u[i]!=0.0)))) + return 1; + } + } + + return 0; +} + + +static int check_s(int ns,int vol,spinor **s) +{ + int k,i; + spin_t *sp,*sm; + + for (k=0;k0) + { + if (((*b).s==NULL)||((shf&0x10)&&((*b0).s!=(*b).s))) + return 5; + if (check_s(ns,vol,(*b).s)) + return 5; + } + else + { + if ((*b).s!=NULL) + return 5; + } + + if ((*b).nsd!=nsd) + return 6; + + if (nsd>0) + { + if (((*b).sd==NULL)||((shf&0x20)&&((*b0).sd!=(*b).sd))) + return 6; + if (check_sd(nsd,vol,(*b).sd)) + return 6; + } + else + { + if ((*b).sd!=NULL) + return 6; + } + + return 0; +} + + +static int check_bnd(block_t *b0,block_t *b, + int iub,int iudb,int nw,int nwd,int shf) +{ + int vol,ifc; + bndry_t *bb0,*bb; + + bb0=(*b).bb; + bb=(*b).bb; + + for (ifc=0;ifc<8;ifc++) + { + vol=(*bb).vol; + + if (iub==1) + { + if (((*bb).u==NULL)||((shf&0x4)&&((*bb0).u!=(*bb).u))) + return 7; + if (check_u(vol,(*bb).u)) + return 7; + } + else + { + if ((*bb).u!=NULL) + return 7; + } + + if (iudb==1) + { + if (((*bb).ud==NULL)||((shf&0x8)&&((*bb0).ud!=(*bb).ud))) + return 8; + if (check_ud(vol,(*bb).ud)) + return 8; + } + else + { + if ((*bb).ud!=NULL) + return 8; + } + + if ((*bb).nw!=nw) + return 9; + + if (nw>0) + { + if (((*bb).w==NULL)||((shf&0x40)&&((*bb0).w!=(*bb).w))) + return 9; + if (check_w(nw,vol,(*bb).w)) + return 9; + } + else + { + if ((*bb).w!=NULL) + return 9; + } + + if ((*bb).nwd!=nwd) + return 10; + + if (nwd>0) + { + if (((*bb).wd==NULL)||((shf&0x80)&&((*bb0).wd!=(*bb).wd))) + return 10; + if (check_wd(nwd,vol,(*bb).wd)) + return 10; + } + else + { + if ((*bb).wd!=NULL) + return 10; + } + + bb0+=1; + bb+=1; + } + + return 0; +} + +int main(int argc,char *argv[]) +{ + int my_rank,n,n0,n1,n2,n3; + int igr,bs[4],nb,isw,itest; + int iu,iud,ns,nsd; + int iub,iudb,nw,nwd; + int shg,shu,shud,shs,shsd,shw,shwd,shf; + block_t *b0,*b; + blk_grid_t grid; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check2.log","w",stdout); + fin=freopen("check1.in","r",stdin); + + printf("\n"); + printf("Checks on the allocation and initialization of the gauge,\n" + "Dirac and Weyl fields on the known block grids.\n"); + printf("---------------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + fclose(fin); + + printf("bs = %d %d %d %d\n\n",bs[0],bs[1],bs[2],bs[3]); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + geometry(); + set_sap_parms(bs,0,1,1); + set_dfl_parms(bs,2); + grid=BLK_GRIDS; + + for (igr=0;igr<(int)(BLK_GRIDS);igr++) + { + iu=0; + iud=0; + ns=0; + nsd=0; + iub=0; + iudb=0; + nw=0; + nwd=0; + + shg=1; + shu=0; + shud=0; + shs=0; + shsd=0; + shw=0; + shwd=0; + + if (igr==0) + { + grid=SAP_BLOCKS; + + iu=1; + ns=3; + nw=1; + iub=1; + shs=1; + } + else if (igr==1) + { + grid=DFL_BLOCKS; + + iud=1; + ns=3; + nsd=3; + shud=1; + } + else + error_root(1,1,"main [check2.c]","Unknown block grid"); + + shf=0x1|(shg<<1)|(shu<<2)|(shud<<3)|(shs<<4)|(shsd<<5)|(shw<<6)|(shwd<<7); + alloc_bgr(grid); + print_grid_flags(grid); + b0=blk_list(grid,&nb,&isw); + + n0=L0/bs[0]; + n1=L1/bs[1]; + n2=L2/bs[2]; + n3=L3/bs[3]; + n=n0*cpr[0]+n1*cpr[1]+n2*cpr[2]+n3*cpr[3]; + + error((b0==NULL)||(nb!=(n0*n1*n2*n3))||(isw!=(n%2)),1, + "main [check2.c]","Incorrect return values of blk_list"); + + if (my_rank==0) + { + printf("Share flag on the blocks = %#x\n",(*b0).shf); + printf("Should be %#x\n\n",shf); + } + + itest=0; + + for (n=0;nL0)|| + ((*b).bo[1]<0)||(((*b).bo[1]+bs[1])>L1)|| + ((*b).bo[2]<0)||(((*b).bo[2]+bs[2])>L2)|| + ((*b).bo[3]<0)||(((*b).bo[3]+bs[3])>L3),1, + "main [check2.c]","b.bo is out of range"); + + error((((*b).bo[0]%bs[0])!=0)||(((*b).bo[1]%bs[1])!=0)|| + (((*b).bo[2]%bs[2])!=0)||(((*b).bo[3]%bs[3])!=0),1, + "main [check2.c]","b.bo is not an integer multiple of bs"); + + n0=(*b).bo[0]/bs[0]; + n1=(*b).bo[1]/bs[1]; + n2=(*b).bo[2]/bs[2]; + n3=(*b).bo[3]/bs[3]; + + isw=(n0+n1+n2+n3)%2; + + error(((isw==0)&&(n>=(nb/2)))||((isw==1)&&(n<(nb/2))),1, + "main [check2.c]","Blocks are not locally even-odd ordered"); + + itest=check_blk(b0,b,iu,iud,ns,nsd,shf); + if (itest!=0) + break; + + error((*b).bb==NULL,1,"main [check2.c]", + "Block boundaries are not allocated"); + + itest=check_bnd(b0,b,iub,iudb,nw,nwd,shf); + if (itest!=0) + break; + } + + error(itest==1,1,"main [check2.c]","Unexpected share flag"); + error(itest==2,1,"main [check2.c]","Geometry arrays are not shared"); + error(itest==3,1,"main [check2.c]", + "b.u or b.sw is not in the proper condition"); + error(itest==4,1,"main [check2.c]", + "b.ud or b.swd is not in the proper condition"); + error(itest==5,1,"main [check2.c]", + "b.s is not in the proper condition"); + error(itest==6,1,"main [check2.c]", + "b.sd is not in the proper condition"); + error(itest==7,1,"main [check2.c]", + "b.bb.u is not in the proper condition"); + error(itest==8,1,"main [check2.c]", + "b.bb.ud is not in the proper condition"); + error(itest==9,1,"main [check2.c]", + "b.bb.w is not in the proper condition"); + error(itest==10,1,"main [check2.c]", + "b.bb.wd is not in the proper condition"); + } + + error_chk(); + + if (my_rank==0) + { + printf("No errors detected\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..02189a97b31373c65f302c8fa3297c385b0cc8a6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check3.c @@ -0,0 +1,483 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2005, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of assign_ud2ubgr() and assign_ud2udblk(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "block.h" +#include "global.h" + +typedef union +{ + su3 u; + float r[18]; +} umat_t; + +typedef union +{ + su3_dble u; + double r[18]; +} umat_dble_t; + + +static void set_ud(void) +{ + int x0,x1,x2,x3,ix; + int y0,y1,y2,y3,ifc; + su3_dble *udb,*ud; + + random_ud(); + udb=udfld(); + + for (x0=0;x0]"); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + set_sap_parms(bs,0,1,1); + set_dfl_parms(bs,2); + alloc_bgr(SAP_BLOCKS); + alloc_bgr(DFL_BLOCKS); + + set_ud(); + assign_ud2ubgr(SAP_BLOCKS); + assign_ud2u(); + print_flags(); + print_grid_flags(SAP_BLOCKS); + + error(check_ubgr(SAP_BLOCKS),1,"main [check3.c]", + "assign_ud2ubgr() is incorrect"); + + b=blk_list(DFL_BLOCKS,&nb,&isw); + random_ud(); + assign_ud2udblk(DFL_BLOCKS,0); + set_ud(); + ie=0; + + for (n=0;n +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sw_term.h" +#include "block.h" +#include "global.h" + + +static int cmp_sw(pauli *r,pauli *s) +{ + int i; + + for (i=0;i<36;i++) + { + if ((r[0].u[i]!=s[0].u[i])||(r[1].u[i]!=s[1].u[i])) + return 1; + } + + return 0; +} + + +static int cmp_swd(pauli_dble *r,pauli_dble *s) +{ + int i; + + for (i=0;i<36;i++) + { + if ((r[0].u[i]!=s[0].u[i])||(r[1].u[i]!=s[1].u[i])) + return 1; + } + + return 0; +} + + +static int check_sw(block_t *b) +{ + int x0,x1,x2,x3,x[4]; + int y0,y1,y2,y3,ix,iy; + pauli *sw; + + sw=swfld(); + + for (x0=0;x0<(*b).bs[0];x0++) + { + for (x1=0;x1<(*b).bs[1];x1++) + { + for (x2=0;x2<(*b).bs[2];x2++) + { + for (x3=0;x3<(*b).bs[3];x3++) + { + x[0]=x0; + x[1]=x1; + x[2]=x2; + x[3]=x3; + + y0=(*b).bo[0]+x0; + y1=(*b).bo[1]+x1; + y2=(*b).bo[2]+x2; + y3=(*b).bo[3]+x3; + + ix=ipt_blk(b,x); + iy=ipt[y3+L3*y2+L2*L3*y1+L1*L2*L3*y0]; + + if (cmp_sw((*b).sw+2*ix,sw+2*iy)) + return 1; + } + } + } + } + + return 0; +} + + +static int check_swbgr(blk_grid_t grid) +{ + int nb,isw; + block_t *b,*bm; + + b=blk_list(grid,&nb,&isw); + bm=b+nb; + + for (;b]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.301,0.789,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + + set_sap_parms(bs,0,1,1); + set_dfl_parms(bs,2); + alloc_bgr(SAP_BLOCKS); + alloc_bgr(DFL_BLOCKS); + + set_sw_parms(0.05); + random_ud(); + ifail=0; + + for (iset=0;iset<(int)(PT_SETS);iset++) + { + if (iset==0) + set=ALL_PTS; + else if (iset==1) + set=EVEN_PTS; + else if (iset==2) + set=ODD_PTS; + else + set=NO_PTS; + + sw_term(NO_PTS); + ifail+=assign_swd2swbgr(SAP_BLOCKS,set); + ifail+=sw_term(set); + assign_swd2sw(); + error(check_swbgr(SAP_BLOCKS)!=0,1,"main [check4.c]", + "assign_swd2swbgr() is incorrect"); + + b=blk_list(DFL_BLOCKS,&nb,&isw); + + for (n=0;n +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "sw_term.h" +#include "block.h" +#include "global.h" + +typedef union +{ + spinor s; + float r[24]; +} spin_t; + +typedef union +{ + spinor_dble s; + double r[24]; +} spin_dble_t; + + +static int cmp_s(spinor *r,spinor *s) +{ + int i; + spin_t *rr,*rs; + + rr=(spin_t*)(r); + rs=(spin_t*)(s); + + for (i=0;i<24;i++) + { + if ((*rr).r[i]!=(*rs).r[i]) + return 1; + } + + return 0; +} + + +static int cmp_sd(spinor_dble *r,spinor_dble *s) +{ + int i; + spin_dble_t *rr,*rs; + + rr=(spin_dble_t*)(r); + rs=(spin_dble_t*)(s); + + for (i=0;i<24;i++) + { + if ((*rr).r[i]!=(*rs).r[i]) + return 1; + } + + return 0; +} + + +static int check_sb(block_t *b,ptset_t set,int k,spinor *s) +{ + int x0,x1,x2,x3,x[4]; + int y0,y1,y2,y3; + int ix,iy,is,n0,n1; + + for (x0=0;x0<(*b).bs[0];x0++) + { + for (x1=0;x1<(*b).bs[1];x1++) + { + for (x2=0;x2<(*b).bs[2];x2++) + { + for (x3=0;x3<(*b).bs[3];x3++) + { + x[0]=x0; + x[1]=x1; + x[2]=x2; + x[3]=x3; + + y0=(*b).bo[0]+x0; + y1=(*b).bo[1]+x1; + y2=(*b).bo[2]+x2; + y3=(*b).bo[3]+x3; + + ix=ipt_blk(b,x); + iy=ipt[y3+L3*y2+L2*L3*y1+L1*L2*L3*y0]; + is=(y0+y1+y2+y3)%2; + + n0=((is==0)&&((set==ALL_PTS)||(set==EVEN_PTS))); + n1=((is==1)&&((set==ALL_PTS)||(set==ODD_PTS))); + + if ((n0==1)||(n1==1)) + { + if (cmp_s((*b).s[k]+ix,s+iy)) + return 1; + } + } + } + } + } + + return 0; +} + + +static int check_sdb(block_t *b,ptset_t set,int k,spinor_dble *sd) +{ + int x0,x1,x2,x3,x[4],n0,n1; + int y0,y1,y2,y3; + int ix,iy,is; + + for (x0=0;x0<(*b).bs[0];x0++) + { + for (x1=0;x1<(*b).bs[1];x1++) + { + for (x2=0;x2<(*b).bs[2];x2++) + { + for (x3=0;x3<(*b).bs[3];x3++) + { + x[0]=x0; + x[1]=x1; + x[2]=x2; + x[3]=x3; + + y0=(*b).bo[0]+x0; + y1=(*b).bo[1]+x1; + y2=(*b).bo[2]+x2; + y3=(*b).bo[3]+x3; + + ix=ipt_blk(b,x); + iy=ipt[y3+L3*y2+L2*L3*y1+L1*L2*L3*y0]; + is=(y0+y1+y2+y3)%2; + + n0=((is==0)&&((set==ALL_PTS)||(set==EVEN_PTS))); + n1=((is==1)&&((set==ALL_PTS)||(set==ODD_PTS))); + + if ((n0==1)||(n1==1)) + { + if (cmp_sd((*b).sd[k]+ix,sd+iy)) + return 1; + } + } + } + } + } + + return 0; +} + + +static int diff_s(int vol,spinor *s,spinor *r) +{ + spinor *sm; + + sm=s+vol; + + for (;s that allows the +type of boundary condition to be chosen at runtime. When the option is not +set, open boundary conditions are assumed. + +The option may be set but has no effect in the case of check3 and check4 (the +boundary conditions are selected through the input parameter file in these +cases). diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..93081532918d5b0b1851290c14ffe0088b41ec58 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/Makefile @@ -0,0 +1,155 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 check4 + +FLAGS = flags lat_parms sap_parms dfl_parms + +LATTICE = bcnds ftidx uidx geometry + +LINALG = salg salg_dble valg valg_dble liealg cmatrix_dble cmatrix + +LINSOLV = fgcr fgcr4vd + +RANDOM = ranlux ranlxs ranlxd gauss + +UFLDS = plaq_sum shift uflds udcom + +SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +UTILS = endian mutils utils wspace + +SFLDS = sflds scom sdcom Pbnd Pbnd_dble + +TCHARGE = ftcom ftensor + +SW_TERM = pauli pauli_dble swflds sw_term + +DIRAC = Dw_dble Dw Dw_bnd + +BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + +SAP = blk_solv sap_com sap sap_gcr + +ARCHIVE = archive + +DFL = dfl_geometry dfl_subspace ltl_gcr dfl_sap_gcr dfl_modes + +VFLDS = vflds vinit vcom vdcom + +LITTLE = Aw_gen Aw_com Aw_ops Aw_dble Aw ltl_modes + +MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ + $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) $(DIRAC) \ + $(BLOCK) $(SAP) $(ARCHIVE) $(DFL) $(VFLDS) $(LITTLE) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/linsolv:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ + $(MDIR)/sflds:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:\ + $(MDIR)/block:$(MDIR)/sap:$(MDIR)/archive:$(MDIR)/dfl:\ + $(MDIR)/vflds:$(MDIR)/little + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + +# -DFGCR_DBG -DFGCR4VD_DBG -DDFL_MODES_DBG + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..5b8244302847e4cab335c9667ffc604eb0719d24 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check1.c @@ -0,0 +1,286 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2007, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the DFL_BLOCKS grid geometry arrays. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "block.h" +#include "dfl.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,bs[4],nbs,isw; + int nb,nbb,*nbbe,*nbbo,*obbe,*obbo; + int (*inn)[8],*idx,*ipp,*map; + int ix,iy,iz,ifc,ie; + int *bo1,*bo2; + int l[4],mu,is; + double phi[2],phi_prime[2]; + block_t *b; + dfl_grid_t dfl_grid; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + fin=freopen("check1.in","r",stdin); + + printf("\n"); + printf("Check of the DFL_BLOCKS grid geometry arrays\n"); + printf("--------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + fclose(fin); + + printf("bs = %d %d %d %d\n\n",bs[0],bs[1],bs[2],bs[3]); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check1.c]", + "Syntax: check1 [-bc ]"); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + geometry(); + set_dfl_parms(bs,4); + dfl_grid=dfl_geometry(); + nb=dfl_grid.nb; + nbbe=dfl_grid.nbbe; + nbbo=dfl_grid.nbbo; + obbe=dfl_grid.obbe; + obbo=dfl_grid.obbo; + + alloc_bgr(DFL_BLOCKS); + b=blk_list(DFL_BLOCKS,&nbs,&isw); + + error((bs[0]!=(*b).bs[0])||(bs[1]!=(*b).bs[1])||(bs[2]!=(*b).bs[2])|| + (bs[3]!=(*b).bs[3])||(nb!=nbs),1,"main [check1.c]", + "Block sizes bs are incorrectly set or incorrect block number"); + + ie=0; + nbb=(nbbe[0]+nbbo[0]); + + if (obbe[0]!=0) + ie=1; + if (obbo[0]!=(obbe[7]+nbbe[7])) + ie=2; + + for (ifc=1;ifc<8;ifc++) + { + nbb+=(nbbe[ifc]+nbbo[ifc]); + + if (obbe[ifc]!=(obbe[ifc-1]+nbbe[ifc-1])) + ie=1; + if (obbo[ifc]!=(obbo[ifc-1]+nbbo[ifc-1])) + ie=2; + } + + error(nbb!=dfl_grid.nbb,1,"main [check1.c]","nbb is incorrect"); + error(ie==1,1,"main [check1.c]","Incorrect offsets obbe[ifc]"); + error(ie==2,1,"main [check1.c]","Incorrect offsets obbo[ifc]"); + + inn=dfl_grid.inn; + idx=dfl_grid.idx; + ipp=dfl_grid.ipp; + map=dfl_grid.map; + iz=0; + + for (ifc=0;ifc<8;ifc++) + { + for (ix=obbe[ifc];ix<(obbe[ifc]+nbbe[ifc]);ix++) + { + iy=ipp[ix]; + + if ((ix>obbe[ifc])&&(iy<=iz)) + ie=1; + + if (inn[iy][ifc]!=(nb+ix)) + ie=3; + + iz=iy; + } + + for (ix=obbo[ifc];ix<(obbo[ifc]+nbbo[ifc]);ix++) + { + iy=ipp[ix]; + + if ((ix>obbo[ifc])&&(iy<=iz)) + ie=2; + + if (inn[iy][ifc]!=(nb+ix)) + ie=3; + + iz=iy; + } + } + + error(ie==1,1,"main [check1.c]","Incorrect ipp at even boundary points"); + error(ie==2,1,"main [check1.c]","Incorrect ipp at odd boundary points"); + error(ie==3,1,"main [check1.c]","ipp and inn are inconsistent"); + + for (ix=0;ix0)&&(ix<(nb/2)))||(ix>(nb/2))) + { + if (idx[ix]!=idx[ix-1]+1) + ie=2; + } + + if (((ix==0)&&(isw==0))||((ix==(nb/2))&&(isw==1))) + { + bo1=b[idx[ix]].bo; + + for (mu=0;mu<4;mu++) + { + if (bo1[mu]!=0) + ie=3; + } + } + } + + error(ie==1,1,"main [check1.c]","Index array idx[ix] is not involutive"); + error(ie==2,1,"main [check1.c]","The ordering of idx[ix] is incorrect"); + error(ie==3,1,"main [check1.c]","Index of the first block is incorrect "); + + for (ix=0;ix=(nb+nbb))) + ie=1; + else + { + if (iy +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "flags.h" +#include "lattice.h" +#include "block.h" +#include "linalg.h" +#include "sflds.h" +#include "vflds.h" +#include "dfl.h" +#include "global.h" + + +static void check_basis(int Ns,double *dev0,double *dev1) +{ + int nb,isw,i,j; + double dev,x[2],y[2]; + complex_dble z; + block_t *b,*bm; + + b=blk_list(DFL_BLOCKS,&nb,&isw); + bm=b+nb; + + x[0]=0.0; + x[1]=0.0; + + for (;bx[0]) + x[0]=dev; + } + + assign_s2sd((*b).vol,(*b).s[i],(*b).sd[0]); + mulr_spinor_add_dble((*b).vol,(*b).sd[0],(*b).sd[i],-1.0); + dev=norm_square_dble((*b).vol,0,(*b).sd[0]); + dev=sqrt(dev); + + if (dev>x[1]) + x[1]=dev; + } + } + + if (NPROC>1) + { + MPI_Reduce(x,y,2,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(y,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + (*dev0)=y[0]; + (*dev1)=y[1]; + } + else + { + (*dev0)=x[0]; + (*dev1)=x[1]; + } +} + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,i; + int bs[4],Ns,nv; + double phi[2],phi_prime[2]; + double dev,dev0,dev1; + complex **vm,**wv,z; + complex_dble **wvd; + spinor **ws; + spinor_dble **wsd; + FILE *fin=NULL,*flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check2.log","w",stdout); + fin=freopen("check2.in","r",stdin); + + printf("\n"); + printf("Check of the programs in the module dfl_subspace.c\n"); + printf("--------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + read_line("Ns","%d",&Ns); + fclose(fin); + + printf("bs = %d %d %d %d\n",bs[0],bs[1],bs[2],bs[3]); + printf("Ns = %d\n\n",Ns); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check2.c]", + "Syntax: check2 [-bc ]"); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + set_dfl_parms(bs,Ns); + + alloc_ws(Ns+1); + alloc_wsd(1); + alloc_wv(2); + alloc_wvd(2); + + ws=reserve_ws(Ns+1); + wsd=reserve_wsd(1); + vm=vflds()+Ns; + wv=reserve_wv(2); + wvd=reserve_wvd(2); + nv=Ns*VOLUME/(bs[0]*bs[1]*bs[2]*bs[3]); + + for (i=0;idev0) + dev0=dev; + + assign_s2s(VOLUME,ws[i],ws[Ns]); + dfl_sub_v2s(vm[i],ws[Ns]); + dev=(double)(norm_square(VOLUME,1,ws[Ns])/ + norm_square(VOLUME,1,ws[i])); + if (dev>dev1) + dev1=dev; + } + + if (my_rank==0) + { + printf("Check of the single-precision vector modes:\n"); + printf("Using dfl_v2s: %.1e\n",sqrt(dev0)); + printf("Using dfl_sub_v2s: %.1e\n\n",sqrt(dev1)); + } + + random_v(nv,wv[0],1.0f); + random_vd(nv,wvd[0],1.0); + + dfl_v2s(wv[0],ws[Ns]); + dfl_s2v(ws[Ns],wv[1]); + z.re=-1.0f; + z.im=0.0f; + mulc_vadd(nv,wv[0],wv[1],z); + dev0=(double)(vnorm_square(nv,1,wv[0])/ + vnorm_square(nv,1,wv[1])); + + dfl_vd2sd(wvd[0],wsd[0]); + dfl_sd2vd(wsd[0],wvd[1]); + diff_vd2v(nv,wvd[0],wvd[1],wv[0]); + assign_vd2v(nv,wvd[1],wv[1]); + dev1=(double)(vnorm_square(nv,1,wv[0])/ + vnorm_square(nv,1,wv[1])); + + dfl_sub_vd2sd(wvd[1],wsd[0]); + dev=norm_square_dble(VOLUME,1,wsd[0])/vnorm_square_dble(nv,1,wvd[1]); + if (dev>dev1) + dev1=dev; + + if (my_rank==0) + { + printf("Check of\n"); + printf("dfl_s2v,..: %.1e\n",sqrt(dev0)); + printf("dfl_sd2vd,..: %.1e\n\n",sqrt(dev1)); + } + + error_chk(); + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check2.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check2.in new file mode 100644 index 0000000000000000000000000000000000000000..20253b46f4d0a08e55b1fbbf01d1a7f70685d112 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check2.in @@ -0,0 +1,2 @@ +bs 8 4 4 4 +Ns 4 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..89185a0df62c1335f9b8bd394a26b22149946cd8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check3.c @@ -0,0 +1,317 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the solver for the little Dirac equation. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "archive.h" +#include "uflds.h" +#include "sw_term.h" +#include "sflds.h" +#include "vflds.h" +#include "linalg.h" +#include "dirac.h" +#include "sap.h" +#include "little.h" +#include "dfl.h" +#include "global.h" + +int my_rank,id,first,last,step; +int bs[4],Ns,nkv,nmx,eoflg,bc; +double kappa,csw,mu,cF,cF_prime; +double phi[2],phi_prime[2],m0,res; +char cnfg_dir[NAME_SIZE],cnfg_file[NAME_SIZE],nbase[NAME_SIZE]; + + +static void new_subspace(void) +{ + int nb,isw,ifail; + int n,nmax,k,l; + spinor **mds,**ws; + sap_parms_t sp; + + blk_list(SAP_BLOCKS,&nb,&isw); + + if (nb==0) + alloc_bgr(SAP_BLOCKS); + + assign_ud2ubgr(SAP_BLOCKS); + sw_term(NO_PTS); + ifail=assign_swd2swbgr(SAP_BLOCKS,ODD_PTS); + + error(ifail!=0,1,"new_subspace [check3.c]", + "Inversion of the SW term was not safe"); + + sp=sap_parms(); + nmax=6; + mds=reserve_ws(Ns); + ws=reserve_ws(1); + + for (k=0;k %sn%d in steps of %d\n\n", + nbase,first,nbase,last,step); + fflush(flog); + } + + error_root(((last-first)%step)!=0,1,"main [check3.c]", + "last-first is not a multiple of step"); + check_dir_root(cnfg_dir); + + nsize=name_size("%s/%sn%d",cnfg_dir,nbase,last); + error_root(nsize>=NAME_SIZE,1,"main [check3.c]", + "cnfg_dir name is too long"); + + for (icnfg=first;icnfg<=last;icnfg+=step) + { + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + import_cnfg(cnfg_file); + chs_ubnd(-1); + + if (my_rank==0) + { + printf("Configuration no %d\n",icnfg); + fflush(flog); + } + + new_subspace(); + random_vd(nv,wvd[0],1.0); + nrm=sqrt(vnorm_square_dble(nv,1,wvd[0])); + assign_vd2vd(nv,wvd[0],wvd[2]); + set_Awhat(mu); + + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + rho=ltl_gcr(nkv,nmx,res,mu,wvd[0],wvd[1],&status); + + MPI_Barrier(MPI_COMM_WORLD); + wt2=MPI_Wtime(); + wdt=wt2-wt1; + + error_chk(); + z.re=-1.0; + z.im=0.0; + mulc_vadd_dble(nv,wvd[2],wvd[0],z); + del=vnorm_square_dble(nv,1,wvd[2]); + error_root(del!=0.0,1,"main [check3.c]", + "Source field is not preserved"); + + set_Aw(mu); + set_Awhat(mu); + Aw_dble(wvd[1],wvd[2]); + mulc_vadd_dble(nv,wvd[2],wvd[0],z); + Aweeinv_dble(wvd[2],wvd[3]); + assign_vd2vd(nv/2,wvd[3],wvd[2]); + del=sqrt(vnorm_square_dble(nv,1,wvd[2])); + + if (my_rank==0) + { + printf("status = %d\n",status); + printf("rho = %.2e, res = %.2e\n",rho,res); + printf("check = %.2e, check = %.2e\n",del,del/nrm); + printf("time = %.2e sec (total)\n",wdt); + if (status>0) + printf(" = %.2e usec (per point and GCR iteration)", + (1.0e6*wdt)/((double)(status)*(double)(VOLUME))); + printf("\n\n"); + fflush(flog); + } + + ltl_gcr(nkv,nmx,res,mu,wvd[0],wvd[0],&status); + mulc_vadd_dble(nv,wvd[0],wvd[1],z); + del=vnorm_square_dble(nv,1,wvd[0]); + error_root(del!=0.0,1,"main [check3.c]", + "Incorrect result when the input and output fields coincide"); + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check3.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check3.in new file mode 100644 index 0000000000000000000000000000000000000000..d4e586de5590793ec90ccc044b3b5c6b57d9dadd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check3.in @@ -0,0 +1,29 @@ + +[Configurations] +name 16x8x8x8b6.00id2 +cnfg_dir /home/data/openQCD/cnfg +first 7 +last 7 +step 1 + +[Lattice parameters] +kappa 0.1280 +csw 1.2 +mu 0.0123 +eoflg 1 + +[Boundary conditions] +type 0 +#phi 0.12 -0.56 +#phi' 0.92 0.76 +cF 0.95 +#cF' 0.90 + +[DFL] +bs 4 4 4 4 +Ns 4 + +[GCR] +nkv 16 +nmx 48 +res 1.0e-13 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check4.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check4.c new file mode 100644 index 0000000000000000000000000000000000000000..4b3829a5a6e1f7bcd3248b9fb61125108dd66679 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check4.c @@ -0,0 +1,339 @@ + +/******************************************************************************* +* +* File check4.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check and performance of the deflated SAP+GCR solver. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "archive.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "dirac.h" +#include "dfl.h" +#include "global.h" + +int my_rank,id,first,last,step; +int bs_sap[4],nmr_sap,ncy_sap,nkv_gcr,nmx_gcr; +int bs_dfl[4],Ns,nkv_dfl,nmx_dfl,nkv_dpr,nmx_dpr,eoflg,bc; +int ninv_dgn,nmr_dgn,ncy_dgn; +double kappa,csw,mu,cF,cF_prime; +double phi[2],phi_prime[2],m0,res_gcr,res_dpr; +double kappa_dgn,mu_dgn; +char cnfg_dir[NAME_SIZE],cnfg_file[NAME_SIZE],nbase[NAME_SIZE]; + + +int main(int argc,char *argv[]) +{ + int nsize,icnfg,ncnfg; + int status[2],avgstat[2]; + double rho,nrm,del,resm; + double wt1,wt2,wdt,wta; + spinor_dble **psd; + lat_parms_t lat; + tm_parms_t tm; + dfl_pro_parms_t dpr; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check4.log","w",stdout); + fin=freopen("check4.in","r",stdin); + + printf("\n"); + printf("Check and performance of the deflated SAP+GCR solver\n"); + printf("----------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + find_section("Configurations"); + read_line("name","%s",nbase); + read_line("cnfg_dir","%s",cnfg_dir); + read_line("first","%d",&first); + read_line("last","%d",&last); + read_line("step","%d",&step); + + find_section("Lattice parameters"); + read_line("kappa","%lf",&kappa); + read_line("csw","%lf",&csw); + read_line("mu","%lf",&mu); + read_line("eoflg","%d",&eoflg); + + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + cF=1.0; + cF_prime=1.0; + + if (bc==1) + read_dprms("phi",2,phi); + + if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + + if (bc!=3) + read_line("cF","%lf",&cF); + + if (bc==2) + read_line("cF'","%lf",&cF_prime); + else + cF_prime=cF; + + find_section("SAP"); + read_line("bs","%d %d %d %d",bs_sap,bs_sap+1,bs_sap+2,bs_sap+3); + read_line("nmr","%d",&nmr_sap); + read_line("ncy","%d",&ncy_sap); + + find_section("Deflation subspace"); + read_line("bs","%d %d %d %d",bs_dfl,bs_dfl+1,bs_dfl+2,bs_dfl+3); + read_line("Ns","%d",&Ns); + + find_section("Deflation subspace generation"); + read_line("kappa","%lf",&kappa_dgn); + read_line("mu","%lf",&mu_dgn); + read_line("ninv","%d",&ninv_dgn); + read_line("nmr","%d",&nmr_dgn); + read_line("ncy","%d",&ncy_dgn); + + find_section("Deflation projection"); + read_line("nkv","%d",&nkv_dpr); + read_line("nmx","%d",&nmx_dpr); + read_line("res","%lf",&res_dpr); + + find_section("GCR"); + read_line("nkv","%d",&nkv_gcr); + read_line("nmx","%d",&nmx_gcr); + read_line("res","%lf",&res_gcr); + + fclose(fin); + } + + MPI_Bcast(nbase,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(cnfg_dir,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(&first,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&last,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&step,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&csw,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&eoflg,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + MPI_Bcast(bs_sap,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr_sap,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy_sap,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(bs_dfl,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(&kappa_dgn,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu_dgn,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&ninv_dgn,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr_dgn,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy_dgn,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(&nkv_dpr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx_dpr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res_dpr,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + MPI_Bcast(&nkv_gcr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx_gcr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res_gcr,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + lat=set_lat_parms(5.5,1.0,1,&kappa,csw); + print_lat_parms(); + + set_bc_parms(bc,1.0,1.0,cF,cF_prime,phi,phi_prime); + print_bc_parms(); + + set_sap_parms(bs_sap,1,nmr_sap,ncy_sap); + m0=lat.m0[0]; + set_sw_parms(m0); + tm=set_tm_parms(eoflg); + set_dfl_parms(bs_dfl,Ns); + dpr=set_dfl_pro_parms(nkv_dpr,nmx_dpr,res_dpr); + set_dfl_gen_parms(kappa_dgn,mu_dgn,ninv_dgn,nmr_dgn,ncy_dgn); + + if (my_rank==0) + { + printf("mu = %.6f\n",mu); + printf("eoflg = %d\n\n",tm.eoflg); + } + + print_sap_parms(1); + print_dfl_parms(0); + + if (my_rank==0) + { + printf("GCR parameters:\n"); + printf("nkv = %d\n",nkv_gcr); + printf("nmx = %d\n",nmx_gcr); + printf("res = %.2e\n\n",res_gcr); + + printf("Configurations %sn%d -> %sn%d in steps of %d\n\n", + nbase,first,nbase,last,step); + fflush(flog); + } + + start_ranlux(0,1234); + geometry(); + + if (Ns<=(2*nkv_gcr)) + alloc_ws(2*nkv_gcr+2); + else + alloc_ws(Ns+2); + alloc_wsd(6); + alloc_wv(2*dpr.nkv+2); + alloc_wvd(4); + psd=reserve_wsd(3); + + error_root(((last-first)%step)!=0,1,"main [check4.c]", + "last-first is not a multiple of step"); + + nsize=name_size("%s/%sn%d",cnfg_dir,nbase,last); + error_root(nsize>=NAME_SIZE,1,"main [check4.c]", + "cnfg_dir name is too long"); + + ncnfg=0; + avgstat[0]=0; + avgstat[1]=0; + resm=0.0; + wta=0.0; + + for (icnfg=first;icnfg<=last;icnfg+=step) + { + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + import_cnfg(cnfg_file); + chs_ubnd(-1); + + if (my_rank==0) + { + printf("Configuration no %d\n",icnfg); + fflush(flog); + } + + dfl_modes(status); + error_root(status[0]<0,1,"main [check4.c]", + "Subspace generation failed"); + random_sd(VOLUME,psd[0],1.0); + bnd_sd2zero(ALL_PTS,psd[0]); + nrm=sqrt(norm_square_dble(VOLUME,1,psd[0])); + assign_sd2sd(VOLUME,psd[0],psd[2]); + + rho=dfl_sap_gcr(nkv_gcr,nmx_gcr,res_gcr,mu,psd[0],psd[1],status); + + error_chk(); + mulr_spinor_add_dble(VOLUME,psd[2],psd[0],-1.0); + del=norm_square_dble(VOLUME,1,psd[2]); + error_root(del!=0.0,1,"main [check4.c]", + "Source field is not preserved"); + + Dw_dble(mu,psd[1],psd[2]); + mulr_spinor_add_dble(VOLUME,psd[2],psd[0],-1.0); + del=sqrt(norm_square_dble(VOLUME,1,psd[2])); + + if (my_rank==0) + { + printf("status = %d,%d\n",status[0],status[1]); + printf("rho = %.2e, res = %.2e\n",rho,res_gcr); + printf("check = %.2e, check = %.2e\n",del,del/nrm); + fflush(flog); + } + + if ((status[0]>=0)&&(status[1]>=0)) + { + ncnfg+=1; + avgstat[0]+=status[0]; + avgstat[1]+=status[1]; + del/=nrm; + + if (del>resm) + resm=del; + + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + rho=dfl_sap_gcr(nkv_gcr,nmx_gcr,res_gcr,mu,psd[0],psd[0],status); + + MPI_Barrier(MPI_COMM_WORLD); + wt2=MPI_Wtime(); + wdt=wt2-wt1; + wta+=wdt; + + if (my_rank==0) + { + printf("time = %.2e sec (w/o preparatory steps)\n",wdt); + if (status[0]>0) + printf(" = %.2e usec (per point and GCR iteration)", + (1.0e6*wdt)/((double)(status[0])*(double)(VOLUME))); + printf("\n\n"); + fflush(flog); + } + + mulr_spinor_add_dble(VOLUME,psd[0],psd[1],-1.0); + del=norm_square_dble(VOLUME,1,psd[0]); + error_root(del!=0.0,1,"main [check4.c]","Incorrect result when " + "the input and output fields coincide"); + } + } + + if (my_rank==0) + { + printf("Summary of results\n"); + printf("------------------\n\n"); + + printf("Processed %d configurations\n",ncnfg); + printf("Solver failed in %d cases\n",(last-first)/step+1-ncnfg); + printf("Maximal relative residue = %.1e\n",resm); + + status[0]=(avgstat[0]+ncnfg/2)/ncnfg; + status[1]=(avgstat[1]+ncnfg/2)/ncnfg; + wta/=(double)(ncnfg); + + printf("Average status = %d,%d\n",status[0],status[1]); + printf("Average time = %.2e sec (w/o preparatory steps)\n",wta); + if (status[0]>0) + printf(" = %.2e usec (per point and GCR iteration)", + (1.0e6*wta)/((double)(status[0])*(double)(VOLUME))); + printf("\n\n"); + + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check4.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check4.in new file mode 100644 index 0000000000000000000000000000000000000000..25bee13da74ae390e47019e1b064520f4f04953b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check4.in @@ -0,0 +1,46 @@ + +[Configurations] +name 16x8x8x8b6.00id2 +cnfg_dir /home/data/openQCD/cnfg +first 7 +last 7 +step 1 + +[Lattice parameters] +kappa 0.1280 +csw 1.2 +mu 0.0123 +eoflg 1 + +[Boundary conditions] +type 0 +#phi 0.12 -0.56 +#phi' 0.92 0.76 +cF 0.95 +#cF' 0.90 + +[SAP] +bs 4 4 4 4 +nmr 4 +ncy 5 + +[Deflation subspace] +bs 4 4 4 4 +Ns 8 + +[Deflation subspace generation] +kappa 0.1350 +mu 0.01 +ninv 5 +nmr 4 +ncy 5 + +[Deflation projection] +nkv 16 +nmx 64 +res 1.0e-2 + +[GCR] +nkv 16 +nmx 48 +res 1.0e-10 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..7dad660954b1127c9cafc91bd6971a0e8e952b6f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/INDEX @@ -0,0 +1,34 @@ + +Programs for the O(a)-improved Wilson-Dirac operator. + +check1 Gauge covariance of Dw(). + +check2 Action of Dw() on plane waves. + +check3 Hermiticity of Dw() and comparison with + Dwee(),..,Dwhat(). + +check4 Gauge covariance of Dw_dble(). + +check5 Action of Dw_dble() on plane waves. + +check6 Hermiticity of Dw_dble() and comparison with + Dwee_dble(),..,Dwhat_dble(). + +check7 Comparison of Dw_blk() with Dw(). + +check8 Comparison of Dw_blk_dble() with Dw_dble(). + +check9 Comparison of Dw_bnd() with Dw(). + +time1 Timing of Dw() and Dwhat(). + +time2 Timing of Dw_dble() and Dwhat_dble(). + +time3 Timing of Dw_blk() and Dwhat_blk(). + +time4 Timing of Dw_blk_dble() and Dwhat_blk_dble(). + +The programs check1,..,time4 accept the option -bc that allows the +type of boundary condition to be chosen at runtime. When the option is not +set, open boundary conditions are assumed. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..480067a1ff7a9c0f6af992156130bfcb7e13c1fe --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/Makefile @@ -0,0 +1,143 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 check4 check5 check6 check7 check8 check9 \ + time1 time2 time3 time4 + +FLAGS = flags lat_parms sap_parms dfl_parms + +LATTICE = bcnds ftidx uidx geometry + +LINALG = salg salg_dble liealg cmatrix_dble + +RANDOM = ranlux ranlxs ranlxd gauss + +UFLDS = plaq_sum shift uflds udcom + +SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +UTILS = endian mutils utils wspace + +SFLDS = sflds scom sdcom Pbnd Pbnd_dble + +TCHARGE = ftcom ftensor + +SW_TERM = pauli pauli_dble swflds sw_term + +DIRAC = Dw_dble Dw Dw_bnd + +BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + +SAP = sap_com + +MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(RANDOM) $(UFLDS) \ + $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) \ + $(DIRAC) $(BLOCK) $(SAP) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/random:\ + $(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:$(MDIR)/sflds:\ + $(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:$(MDIR)/block:\ + $(MDIR)/sap + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..cc8e00111892a04fc1955d984182c2b9ee325c7c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check1.c @@ -0,0 +1,400 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2005, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Gauge covariance of Dw(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "global.h" + +#define N0 (NPROC0*L0) + +static int bc,nfc[8],ofs[8]; +static const su3_dble ud0={{0.0}}; +static su3_dble *g,*gbuf; +static su3_dble wd ALIGNED16; + + +static void pack_gbuf(void) +{ + int ifc,ib,ix; + + nfc[0]=FACE0/2; + nfc[1]=FACE0/2; + nfc[2]=FACE1/2; + nfc[3]=FACE1/2; + nfc[4]=FACE2/2; + nfc[5]=FACE2/2; + nfc[6]=FACE3/2; + nfc[7]=FACE3/2; + + ofs[0]=0; + ofs[1]=ofs[0]+nfc[0]; + ofs[2]=ofs[1]+nfc[1]; + ofs[3]=ofs[2]+nfc[2]; + ofs[4]=ofs[3]+nfc[3]; + ofs[5]=ofs[4]+nfc[4]; + ofs[6]=ofs[5]+nfc[5]; + ofs[7]=ofs[6]+nfc[6]; + + for (ifc=0;ifc<8;ifc++) + { + for (ib=0;ib0) + { + tag=mpi_tag(); + saddr=npr[ifc^0x1]; + raddr=npr[ifc]; + sbuf=gbuf+ofs[ifc]; + rbuf=g+VOLUME+ofs[ifc]; + + if (np&0x1) + { + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + } + } +} + + +static void random_g(void) +{ + int ix,t; + su3_dble unity,*gx; + + unity=ud0; + unity.c11.re=1.0; + unity.c22.re=1.0; + unity.c33.re=1.0; + gx=g; + + for (ix=0;ix0)||(bc!=1)) + random_su3_dble(gx); + else + (*gx)=unity; + + gx+=1; + } + + if (BNDRY>0) + { + pack_gbuf(); + send_gbuf(); + } +} + + +static void transform_ud(void) +{ + int ix,iy,t,ifc; + su3_dble *u; + + u=udfld(); + + for (ix=(VOLUME/2);ix]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_wsd(5); + alloc_ws(5); + ps=reserve_ws(5); + psd=reserve_wsd(5); + + g=amalloc(NSPIN*sizeof(*g),4); + if (BNDRY!=0) + gbuf=amalloc((BNDRY/2)*sizeof(*gbuf),4); + + error((g==NULL)||((BNDRY!=0)&&(gbuf==NULL)),1,"main [check1.c]", + "Unable to allocate auxiliary arrays"); + + swp=set_sw_parms(-0.0123); + mu=0.0376; + + if (my_rank==0) + printf("m0 = %.4e, csw = %.4e, cF = %.4e, cF' = %.4e\n\n", + swp.m0,swp.csw,swp.cF[0],swp.cF[1]); + + random_g(); + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + + assign_ud2u(); + assign_swd2sw(); + + for (i=0;i<5;i++) + { + random_sd(NSPIN,psd[i],1.0); + assign_sd2s(NSPIN,psd[i],ps[i]); + } + + assign_s2s(VOLUME,ps[0],ps[4]); + bnd_s2zero(ALL_PTS,ps[4]); + Dw(mu,ps[0],ps[1]); + mulr_spinor_add(VOLUME,ps[4],ps[0],-1.0f); + d=norm_square(VOLUME,1,ps[4]); + error(d!=0.0f,1,"main [check1.c]","Dw() changes the input field"); + + Dw(mu,ps[0],ps[4]); + mulr_spinor_add(VOLUME,ps[4],ps[1],-1.0f); + d=norm_square(VOLUME,1,ps[4]); + error(d!=0.0f,1,"main [check1.c]","Action of Dw() depends " + "on the boundary values of the input field"); + + assign_s2s(VOLUME,ps[1],ps[4]); + bnd_s2zero(ALL_PTS,ps[4]); + mulr_spinor_add(VOLUME,ps[4],ps[1],-1.0f); + d=norm_square(VOLUME,1,ps[4]); + error(d!=0.0f,1,"main [check1.c]", + "Dw() does not preserve the zero boundary values"); + + transform_ud(); + transform_sd(psd[0],psd[2]); + sw_term(NO_PTS); + + assign_ud2u(); + assign_swd2sw(); + assign_sd2s(VOLUME,psd[2],ps[2]); + + Dw(mu,ps[2],ps[3]); + assign_s2sd(VOLUME,ps[1],psd[1]); + transform_sd(psd[1],psd[2]); + assign_sd2s(VOLUME,psd[2],ps[2]); + + mulr_spinor_add(VOLUME,ps[3],ps[2],-1.0f); + d=norm_square(VOLUME,1,ps[3])/norm_square(VOLUME,1,ps[0]); + error_chk(); + + if (my_rank==0) + { + printf("Normalized difference = %.2e\n",sqrt((double)(d))); + printf("(should be less than 1*10^(-6) or so)\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..8e13e38e1c691e2a421f8aea886df38a746a231b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check2.c @@ -0,0 +1,345 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2005, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Action of Dw() on plane waves. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "global.h" + +static spinor rs ALIGNED16; +static const spinor sd0={{{0.0}}}; + + +static su3_vector mul_cplx(complex z,su3_vector s) +{ + su3_vector r; + + r.c1.re=z.re*s.c1.re-z.im*s.c1.im; + r.c1.im=z.im*s.c1.re+z.re*s.c1.im; + r.c2.re=z.re*s.c2.re-z.im*s.c2.im; + r.c2.im=z.im*s.c2.re+z.re*s.c2.im; + r.c3.re=z.re*s.c3.re-z.im*s.c3.im; + r.c3.im=z.im*s.c3.re+z.re*s.c3.im; + + return r; +} + + +static spinor mul_gamma(int mu,spinor s) +{ + spinor r; + complex i,m_i,m_1; + + i.re=0.0f; + i.im=1.0f; + + m_i.re=0.0f; + m_i.im=-1.0f; + + m_1.re=-1.0f; + m_1.im=0.0f; + + if (mu==0) + { + r.c1=mul_cplx(m_1,s.c3); + r.c2=mul_cplx(m_1,s.c4); + r.c3=mul_cplx(m_1,s.c1); + r.c4=mul_cplx(m_1,s.c2); + } + else if (mu==1) + { + r.c1=mul_cplx(m_i,s.c4); + r.c2=mul_cplx(m_i,s.c3); + r.c3=mul_cplx(i,s.c2); + r.c4=mul_cplx(i,s.c1); + } + else if (mu==2) + { + r.c1=mul_cplx(m_1,s.c4); + r.c2=s.c3; + r.c3=s.c2; + r.c4=mul_cplx(m_1,s.c1); + } + else if (mu==3) + { + r.c1=mul_cplx(m_i,s.c3); + r.c2=mul_cplx(i,s.c4); + r.c3=mul_cplx(i,s.c1); + r.c4=mul_cplx(m_i,s.c2); + } + else + { + r.c1=s.c1; + r.c2=s.c2; + r.c3=mul_cplx(m_1,s.c3); + r.c4=mul_cplx(m_1,s.c4); + } + + return r; +} + + +int main(int argc,char *argv[]) +{ + int my_rank,bc; + int n,i,ix,nu,x0,x1,x2,x3; + int np[4],bo[4]; + float ran[4]; + float mu,pi,d,dmax; + float mp,pt,pv,p[4],sp[4]; + double phi[2],phi_prime[2]; + complex z; + spinor **ps,s0,s1,s2,s3,s4; + sw_parms_t swp; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check2.log","w",stdout); + printf("\n"); + printf("Action of Dw() on plane waves\n"); + printf("-----------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + printf("For this test to pass, the calculated differences delta\n"); + printf("should be at most 1*10^(-5) or so\n\n"); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check2.c]", + "Syntax: check2 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_ws(3); + ps=reserve_ws(3); + + swp=set_sw_parms(-0.0123); + mu=0.0876f; + + if (my_rank==0) + printf("m0 = %.4e, csw = %.4e, cF = %.4e, cF' = %.4e\n\n", + swp.m0,swp.csw,swp.cF[0],swp.cF[1]); + + (void)udfld(); + chs_ubnd(-1); + sw_term(NO_PTS); + assign_ud2u(); + assign_swd2sw(); + pi=(float)(4.0*atan(1.0)); + n=10; + bo[0]=cpr[0]*L0; + bo[1]=cpr[1]*L1; + bo[2]=cpr[2]*L2; + bo[3]=cpr[3]*L3; + dmax=0.0f; + + for (i=0;idmax) + dmax=d; + + if (my_rank==0) + printf("Normalized deviation = %.1e at p=(%d,%d,%d,%d)\n", + d,np[0],np[1],np[2],np[3]); + } + + error_chk(); + + if (my_rank==0) + { + printf("\n"); + printf("Maximal normalized deviation = %.1e\n\n",dmax); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..06e819aad7b435cc113b89b764fb0426ef1d019a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check3.c @@ -0,0 +1,301 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2005, 2008, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Hermiticity of Dw() and comparison with Dwee(),..,Dwhat(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,i; + float mu,d; + double phi[2],phi_prime[2]; + complex z1,z2; + spinor **ps; + sw_parms_t swp; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check3.log","w",stdout); + printf("\n"); + printf("Hermiticity of Dw() and comparison with Dwee(),..,Dwhat()\n"); + printf("---------------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + printf("For this test to pass, the calculated differences\n"); + printf("should be at most 1*10^(-5) or so\n\n"); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check1.c]", + "Syntax: check1 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_ws(5); + ps=reserve_ws(5); + + swp=set_sw_parms(-0.0123); + mu=0.0376; + + if (my_rank==0) + printf("m0 = %.4e, csw = %.4e, cF = %.4e, cF' = %.4e\n\n", + swp.m0,swp.csw,swp.cF[0],swp.cF[1]); + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + assign_ud2u(); + assign_swd2sw(); + + for (i=0;i<4;i++) + random_s(NSPIN,ps[i],1.0f); + + Dw(mu,ps[0],ps[2]); + mulg5(VOLUME,ps[2]); + Dw(-mu,ps[1],ps[3]); + mulg5(VOLUME,ps[3]); + + z1=spinor_prod(VOLUME,1,ps[0],ps[3]); + z2=spinor_prod(VOLUME,1,ps[2],ps[1]); + + d=(float)(sqrt((double)((z1.re-z2.re)*(z1.re-z2.re)+ + (z1.im-z2.im)*(z1.im-z2.im)))); + d/=(float)(sqrt((double)(12*NPROC)*(double)(VOLUME))); + error_chk(); + + if (my_rank==0) + printf("Deviation from gamma5-Hermiticity = %.1e\n",d); + + for (i=0;i<4;i++) + random_s(NSPIN,ps[i],1.0f); + + assign_s2s(VOLUME,ps[0],ps[1]); + assign_s2s(VOLUME,ps[2],ps[3]); + Dwee(mu,ps[1],ps[2]); + + bnd_s2zero(EVEN_PTS,ps[0]); + mulr_spinor_add(VOLUME,ps[1],ps[0],-1.0f); + d=norm_square(VOLUME,1,ps[1]); + + error(d!=0.0f,1,"main [check3.c]", + "Dwee() changes the input field in unexpected ways"); + + mulr_spinor_add(VOLUME/2,ps[2]+(VOLUME/2),ps[3]+(VOLUME/2),-1.0f); + assign_s2s(VOLUME/2,ps[2],ps[4]); + bnd_s2zero(EVEN_PTS,ps[4]); + mulr_spinor_add(VOLUME/2,ps[2],ps[4],-1.0f); + d=norm_square(VOLUME,1,ps[2]); + + error(d!=0.0f,1,"main [check3.c]", + "Dwee() changes the output field where it should not"); + + for (i=0;i<4;i++) + random_s(NSPIN,ps[i],1.0f); + + assign_s2s(VOLUME,ps[0],ps[1]); + assign_s2s(VOLUME,ps[2],ps[3]); + Dwoo(mu,ps[1],ps[2]); + + bnd_s2zero(ODD_PTS,ps[0]); + mulr_spinor_add(VOLUME,ps[1],ps[0],-1.0f); + d=norm_square(VOLUME,1,ps[1]); + + error(d!=0.0f,1,"main [check3.c]", + "Dwoo() changes the input field in unexpected ways"); + + mulr_spinor_add(VOLUME/2,ps[2],ps[3],-1.0f); + assign_s2s(VOLUME/2,ps[2]+(VOLUME/2),ps[4]+(VOLUME/2)); + bnd_s2zero(ODD_PTS,ps[4]); + mulr_spinor_add(VOLUME/2,ps[2]+(VOLUME/2),ps[4]+(VOLUME/2),-1.0f); + d=norm_square(VOLUME,1,ps[2]); + + error(d!=0.0f,1,"main [check3.c]", + "Dwoo() changes the output field where it should not"); + + for (i=0;i<4;i++) + random_s(NSPIN,ps[i],1.0f); + + assign_s2s(VOLUME,ps[0],ps[1]); + assign_s2s(VOLUME,ps[2],ps[3]); + Dwoe(ps[1],ps[2]); + + bnd_s2zero(EVEN_PTS,ps[0]); + mulr_spinor_add(VOLUME,ps[1],ps[0],-1.0f); + d=norm_square(VOLUME,1,ps[1]); + + error(d!=0.0f,1,"main [check3.c]", + "Dwoe() changes the input field in unexpected ways"); + + mulr_spinor_add(VOLUME/2,ps[2],ps[3],-1.0f); + assign_s2s(VOLUME/2,ps[2]+(VOLUME/2),ps[4]+(VOLUME/2)); + bnd_s2zero(ODD_PTS,ps[4]); + mulr_spinor_add(VOLUME/2,ps[2]+(VOLUME/2),ps[4]+(VOLUME/2),-1.0f); + d=norm_square(VOLUME,1,ps[2]); + + error(d!=0.0f,1,"main [check3.c]", + "Dwoe() changes the output field where it should not"); + + for (i=0;i<4;i++) + random_s(NSPIN,ps[i],1.0f); + + assign_s2s(VOLUME,ps[0],ps[1]); + assign_s2s(VOLUME,ps[2],ps[3]); + Dweo(ps[1],ps[2]); + + bnd_s2zero(ODD_PTS,ps[0]); + mulr_spinor_add(VOLUME,ps[1],ps[0],-1.0f); + d=norm_square(VOLUME,1,ps[1]); + + error(d!=0.0f,1,"main [check3.c]", + "Dweo() changes the input field in unexpected ways"); + + mulr_spinor_add(VOLUME/2,ps[2]+(VOLUME/2),ps[3]+(VOLUME/2),-1.0f); + assign_s2s(VOLUME/2,ps[2],ps[4]); + bnd_s2zero(EVEN_PTS,ps[4]); + mulr_spinor_add(VOLUME/2,ps[2],ps[4],-1.0f); + d=norm_square(VOLUME,1,ps[2]); + + error(d!=0.0f,1,"main [check3.c]", + "Dweo() changes the output field where it should not"); + + for (i=0;i<4;i++) + random_s(NSPIN,ps[i],1.0f); + + assign_s2s(VOLUME,ps[0],ps[1]); + assign_s2s(VOLUME,ps[2],ps[3]); + Dwhat(mu,ps[1],ps[2]); + + bnd_s2zero(EVEN_PTS,ps[0]); + mulr_spinor_add(VOLUME,ps[1],ps[0],-1.0f); + d=norm_square(VOLUME,1,ps[1]); + + error(d!=0.0f,1,"main [check3.c]", + "Dwhat() changes the input field in unexpected ways"); + + mulr_spinor_add(VOLUME/2,ps[2]+(VOLUME/2),ps[3]+(VOLUME/2),-1.0f); + assign_s2s(VOLUME/2,ps[2],ps[4]); + bnd_s2zero(EVEN_PTS,ps[4]); + mulr_spinor_add(VOLUME/2,ps[2],ps[4],-1.0f); + d=norm_square(VOLUME,1,ps[2]); + + error(d!=0.0f,1,"main [check3.c]", + "Dwhat() changes the output field where it should not"); + + for (i=0;i<4;i++) + random_s(NSPIN,ps[i],1.0f); + + assign_s2s(VOLUME,ps[0],ps[2]); + Dw(mu,ps[0],ps[1]); + Dwee(mu,ps[2],ps[3]); + set_s2zero(VOLUME/2,ps[0]); + mulr_spinor_add(VOLUME/2,ps[0],ps[3],-1.0f); + Dweo(ps[2],ps[0]); + set_s2zero(VOLUME/2,ps[3]); + mulr_spinor_add(VOLUME/2,ps[3],ps[0],-1.0f); + + Dwoo(mu,ps[2],ps[3]); + Dwoe(ps[2],ps[4]); + mulr_spinor_add(VOLUME/2,ps[3]+(VOLUME/2),ps[4]+(VOLUME/2),1.0f); + mulr_spinor_add(VOLUME,ps[3],ps[1],-1.0f); + d=norm_square(VOLUME,1,ps[3])/norm_square(VOLUME,1,ps[1]); + d=(float)(sqrt((double)(d))); + + if (my_rank==0) + printf("Deviation of Dw() from Dwee(),.. = %.1e\n",d); + + for (i=0;i<4;i++) + random_s(NSPIN,ps[i],1.0f); + + assign_s2s(NSPIN,ps[0],ps[1]); + Dwhat(mu,ps[0],ps[2]); + + Dwoe(ps[1],ps[1]); + Dwee(mu,ps[1],ps[1]); + Dwoo(0.0,ps[1],ps[1]); + Dweo(ps[1],ps[1]); + + mulr_spinor_add(VOLUME/2,ps[1],ps[2],-1.0f); + d=norm_square(VOLUME/2,1,ps[1])/norm_square(VOLUME/2,1,ps[2]); + d=(float)(sqrt((double)(d))); + + if (my_rank==0) + printf("Deviation of Dwhat() from Dwee(),.. = %.1e\n",d); + + for (i=0;i<4;i++) + random_s(NSPIN,ps[i],1.0f); + + assign_s2s(VOLUME,ps[0],ps[2]); + + set_tm_parms(1); + Dw(mu,ps[0],ps[1]); + set_tm_parms(0); + + Dwee(mu,ps[2],ps[3]); + mulr_spinor_add(VOLUME/2,ps[1],ps[3],-1.0f); + Dweo(ps[2],ps[1]); + Dwoe(ps[2],ps[3]); + mulr_spinor_add(VOLUME/2,ps[1]+(VOLUME/2),ps[3]+(VOLUME/2),-1.0f); + Dwoo(0.0f,ps[2],ps[3]); + mulr_spinor_add(VOLUME/2,ps[1]+(VOLUME/2),ps[3]+(VOLUME/2),-1.0f); + d=norm_square(VOLUME,1,ps[1])/norm_square(VOLUME,1,ps[2]); + d=(float)(sqrt((double)(d))); + + error_chk(); + + if (my_rank==0) + { + printf("Check of Dw()|eoflg=1 = %.1e\n\n",d); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check4.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check4.c new file mode 100644 index 0000000000000000000000000000000000000000..d6901bad026190ddfe041b6d7846af9cc683b2e2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check4.c @@ -0,0 +1,384 @@ + +/******************************************************************************* +* +* File check4.c +* +* Copyright (C) 2005, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Gauge covariance of Dw_dble(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "global.h" + +#define N0 (NPROC0*L0) + +static int bc,nfc[8],ofs[8]; +static const su3_dble ud0={{0.0}}; +static su3_dble *g,*gbuf; +static su3_dble wd ALIGNED16; + + +static void pack_gbuf(void) +{ + int ifc,ib,ix; + + nfc[0]=FACE0/2; + nfc[1]=FACE0/2; + nfc[2]=FACE1/2; + nfc[3]=FACE1/2; + nfc[4]=FACE2/2; + nfc[5]=FACE2/2; + nfc[6]=FACE3/2; + nfc[7]=FACE3/2; + + ofs[0]=0; + ofs[1]=ofs[0]+nfc[0]; + ofs[2]=ofs[1]+nfc[1]; + ofs[3]=ofs[2]+nfc[2]; + ofs[4]=ofs[3]+nfc[3]; + ofs[5]=ofs[4]+nfc[4]; + ofs[6]=ofs[5]+nfc[5]; + ofs[7]=ofs[6]+nfc[6]; + + for (ifc=0;ifc<8;ifc++) + { + for (ib=0;ib0) + { + tag=mpi_tag(); + saddr=npr[ifc^0x1]; + raddr=npr[ifc]; + sbuf=gbuf+ofs[ifc]; + rbuf=g+VOLUME+ofs[ifc]; + + if (np&0x1) + { + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + } + } +} + + +static void random_g(void) +{ + int ix,t; + su3_dble unity,*gx; + + unity=ud0; + unity.c11.re=1.0; + unity.c22.re=1.0; + unity.c33.re=1.0; + gx=g; + + for (ix=0;ix0)||(bc!=1)) + random_su3_dble(gx); + else + (*gx)=unity; + + gx+=1; + } + + if (BNDRY>0) + { + pack_gbuf(); + send_gbuf(); + } +} + + +static void transform_ud(void) +{ + int ix,iy,t,ifc; + su3_dble *u; + + u=udfld(); + + for (ix=(VOLUME/2);ix]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_wsd(5); + psd=reserve_wsd(5); + + g=amalloc(NSPIN*sizeof(*g),4); + if (BNDRY!=0) + gbuf=amalloc((BNDRY/2)*sizeof(*gbuf),4); + + error((g==NULL)||((BNDRY!=0)&&(gbuf==NULL)),1,"main [check4.c]", + "Unable to allocate auxiliary arrays"); + + swp=set_sw_parms(-0.0123); + mu=0.0376; + + if (my_rank==0) + printf("m0 = %.4e, csw = %.4e, cF = %.4e, cF' = %.4e\n\n", + swp.m0,swp.csw,swp.cF[0],swp.cF[1]); + + random_g(); + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + + for (i=0;i<4;i++) + random_sd(NSPIN,psd[i],1.0); + + assign_sd2sd(VOLUME,psd[0],psd[4]); + bnd_sd2zero(ALL_PTS,psd[4]); + Dw_dble(mu,psd[0],psd[1]); + mulr_spinor_add_dble(VOLUME,psd[4],psd[0],-1.0); + d=norm_square_dble(VOLUME,1,psd[4]); + error(d!=0.0,1,"main [check4.c]","Dw_dble() changes the input field"); + + Dw_dble(mu,psd[0],psd[4]); + mulr_spinor_add_dble(VOLUME,psd[4],psd[1],-1.0); + d=norm_square_dble(VOLUME,1,psd[4]); + error(d!=0.0,1,"main [check4.c]","Action of Dw_dble() depends " + "on the boundary values of the input field"); + + assign_sd2sd(VOLUME,psd[1],psd[4]); + bnd_sd2zero(ALL_PTS,psd[4]); + mulr_spinor_add_dble(VOLUME,psd[4],psd[1],-1.0); + d=norm_square_dble(VOLUME,1,psd[4]); + error(d!=0.0,1,"main [check4.c]", + "Dw_dble() does not vanish at global time 0 and NPROC0*L0-1 "); + + transform_sd(psd[0],psd[2]); + transform_ud(); + sw_term(NO_PTS); + Dw_dble(mu,psd[2],psd[3]); + transform_sd(psd[1],psd[2]); + + mulr_spinor_add_dble(VOLUME,psd[3],psd[2],-1.0); + d=norm_square_dble(VOLUME,1,psd[3])/norm_square_dble(VOLUME,1,psd[0]); + error_chk(); + + if (my_rank==0) + { + printf("Normalized difference = %.2e\n",sqrt(d)); + printf("(should be around 1*10^(-15) or so)\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check5.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check5.c new file mode 100644 index 0000000000000000000000000000000000000000..63e86da45c91e3565834cbf97d54967ad6093110 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check5.c @@ -0,0 +1,343 @@ + +/******************************************************************************* +* +* File check5.c +* +* Copyright (C) 2005, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Action of Dw_dble() on plane waves. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "global.h" + +static spinor_dble rs ALIGNED16; +static const spinor_dble sd0={{{0.0}}}; + + +static su3_vector_dble mul_cplx(complex_dble z,su3_vector_dble s) +{ + su3_vector_dble r; + + r.c1.re=z.re*s.c1.re-z.im*s.c1.im; + r.c1.im=z.im*s.c1.re+z.re*s.c1.im; + r.c2.re=z.re*s.c2.re-z.im*s.c2.im; + r.c2.im=z.im*s.c2.re+z.re*s.c2.im; + r.c3.re=z.re*s.c3.re-z.im*s.c3.im; + r.c3.im=z.im*s.c3.re+z.re*s.c3.im; + + return r; +} + + +static spinor_dble mul_gamma(int mu,spinor_dble s) +{ + spinor_dble r; + complex_dble i,m_i,m_1; + + i.re=0.0; + i.im=1.0; + + m_i.re=0.0; + m_i.im=-1.0; + + m_1.re=-1.0; + m_1.im=0.0; + + if (mu==0) + { + r.c1=mul_cplx(m_1,s.c3); + r.c2=mul_cplx(m_1,s.c4); + r.c3=mul_cplx(m_1,s.c1); + r.c4=mul_cplx(m_1,s.c2); + } + else if (mu==1) + { + r.c1=mul_cplx(m_i,s.c4); + r.c2=mul_cplx(m_i,s.c3); + r.c3=mul_cplx(i,s.c2); + r.c4=mul_cplx(i,s.c1); + } + else if (mu==2) + { + r.c1=mul_cplx(m_1,s.c4); + r.c2=s.c3; + r.c3=s.c2; + r.c4=mul_cplx(m_1,s.c1); + } + else if (mu==3) + { + r.c1=mul_cplx(m_i,s.c3); + r.c2=mul_cplx(i,s.c4); + r.c3=mul_cplx(i,s.c1); + r.c4=mul_cplx(m_i,s.c2); + } + else + { + r.c1=s.c1; + r.c2=s.c2; + r.c3=mul_cplx(m_1,s.c3); + r.c4=mul_cplx(m_1,s.c4); + } + + return r; +} + + +int main(int argc,char *argv[]) +{ + int my_rank,bc; + int n,i,ix,nu,x0,x1,x2,x3; + int np[4],bo[4]; + float ran[4]; + double phi[2],phi_prime[2]; + double mu,pi,d,dmax; + double mp,pt,pv,p[4],sp[4]; + complex_dble z; + spinor_dble **psd,s0,s1,s2,s3,s4; + sw_parms_t swp; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check5.log","w",stdout); + printf("\n"); + printf("Action of Dw_dble() on plane waves\n"); + printf("----------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + printf("For this test to pass, the calculated differences delta\n"); + printf("should be at most 1*10^(-14) or so\n\n"); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check5.c]", + "Syntax: check5 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_wsd(3); + psd=reserve_wsd(3); + + swp=set_sw_parms(-0.0123); + mu=0.0876; + + if (my_rank==0) + printf("m0 = %.4e, csw = %.4e, cF = %.4e, cF' = %.4e\n\n", + swp.m0,swp.csw,swp.cF[0],swp.cF[1]); + + (void)udfld(); + chs_ubnd(-1); + sw_term(NO_PTS); + pi=4.0*atan(1.0); + n=10; + bo[0]=cpr[0]*L0; + bo[1]=cpr[1]*L1; + bo[2]=cpr[2]*L2; + bo[3]=cpr[3]*L3; + dmax=0.0; + + for (i=0;idmax) + dmax=d; + + if (my_rank==0) + printf("Normalized deviation = %.1e at p=(%d,%d,%d,%d)\n", + d,np[0],np[1],np[2],np[3]); + } + + error_chk(); + + if (my_rank==0) + { + printf("\n"); + printf("Maximal normalized deviation = %.1e\n\n",dmax); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check6.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check6.c new file mode 100644 index 0000000000000000000000000000000000000000..86b1fdb1bd8285bf1a581c655d630b78f52a8b17 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check6.c @@ -0,0 +1,302 @@ + +/******************************************************************************* +* +* File check6.c +* +* Copyright (C) 2005, 2008, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Hermiticity of Dw_dble() and comparison with Dwee_dble(),..,Dwhat_dble(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,i; + double phi[2],phi_prime[2]; + double mu,d; + complex_dble z1,z2; + spinor_dble **psd; + sw_parms_t swp; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check6.log","w",stdout); + printf("\n"); + printf("Hermiticity of Dw_dble() and comparison with Dwee_dble(),..," + "Dwhat_dble()\n"); + printf("------------------------------------------------------------" + "------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + printf("For this test to pass, the calculated differences\n"); + printf("should be at most 1*10^(-15) or so\n\n"); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check6.c]", + "Syntax: check6 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_wsd(5); + psd=reserve_wsd(5); + + swp=set_sw_parms(-0.0123); + mu=0.0376; + + if (my_rank==0) + printf("m0 = %.4e, csw = %.4e, cF = %.4e, cF' = %.4e\n\n", + swp.m0,swp.csw,swp.cF[0],swp.cF[1]); + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + + for (i=0;i<4;i++) + random_sd(NSPIN,psd[i],1.0); + + Dw_dble(mu,psd[0],psd[2]); + mulg5_dble(VOLUME,psd[2]); + Dw_dble(-mu,psd[1],psd[3]); + mulg5_dble(VOLUME,psd[3]); + + z1=spinor_prod_dble(VOLUME,1,psd[0],psd[3]); + z2=spinor_prod_dble(VOLUME,1,psd[2],psd[1]); + + d=sqrt((z1.re-z2.re)*(z1.re-z2.re)+ + (z1.im-z2.im)*(z1.im-z2.im)); + d/=sqrt((double)(12*NPROC)*(double)(VOLUME)); + error_chk(); + + if (my_rank==0) + printf("Deviation from gamma5-Hermiticity = %.1e\n",d); + + for (i=0;i<4;i++) + random_sd(NSPIN,psd[i],1.0); + + assign_sd2sd(VOLUME,psd[0],psd[1]); + assign_sd2sd(VOLUME,psd[2],psd[3]); + Dwee_dble(mu,psd[1],psd[2]); + + bnd_sd2zero(EVEN_PTS,psd[0]); + mulr_spinor_add_dble(VOLUME,psd[1],psd[0],-1.0); + d=norm_square_dble(VOLUME,1,psd[1]); + + error(d!=0.0,1,"main [check6.c]", + "Dwee_dble() changes the input field in unexpected ways"); + + mulr_spinor_add_dble(VOLUME/2,psd[2]+(VOLUME/2),psd[3]+(VOLUME/2),-1.0); + assign_sd2sd(VOLUME/2,psd[2],psd[4]); + bnd_sd2zero(EVEN_PTS,psd[4]); + mulr_spinor_add_dble(VOLUME/2,psd[2],psd[4],-1.0); + d=norm_square_dble(VOLUME,1,psd[2]); + + error(d!=0.0,1,"main [check6.c]", + "Dwee_dble() changes the output field where it should not"); + + for (i=0;i<4;i++) + random_sd(NSPIN,psd[i],1.0); + + assign_sd2sd(VOLUME,psd[0],psd[1]); + assign_sd2sd(VOLUME,psd[2],psd[3]); + Dwoo_dble(mu,psd[1],psd[2]); + + bnd_sd2zero(ODD_PTS,psd[0]); + mulr_spinor_add_dble(VOLUME,psd[1],psd[0],-1.0); + d=norm_square_dble(VOLUME,1,psd[1]); + + error(d!=0.0,1,"main [check6.c]", + "Dwoo_dble() changes the input field in unexpected ways"); + + mulr_spinor_add_dble(VOLUME/2,psd[2],psd[3],-1.0); + assign_sd2sd(VOLUME/2,psd[2]+(VOLUME/2),psd[4]+(VOLUME/2)); + bnd_sd2zero(ODD_PTS,psd[4]); + mulr_spinor_add_dble(VOLUME/2,psd[2]+(VOLUME/2),psd[4]+(VOLUME/2),-1.0); + d=norm_square_dble(VOLUME,1,psd[2]); + + error(d!=0.0,1,"main [check6.c]", + "Dwoo_dble() changes the output field where it should not"); + + for (i=0;i<4;i++) + random_sd(NSPIN,psd[i],1.0); + + assign_sd2sd(VOLUME,psd[0],psd[1]); + assign_sd2sd(VOLUME,psd[2],psd[3]); + Dwoe_dble(psd[1],psd[2]); + + bnd_sd2zero(EVEN_PTS,psd[0]); + mulr_spinor_add_dble(VOLUME,psd[1],psd[0],-1.0); + d=norm_square_dble(VOLUME,1,psd[1]); + + error(d!=0.0,1,"main [check6.c]", + "Dwoe_dble() changes the input field in unexpected ways"); + + mulr_spinor_add_dble(VOLUME/2,psd[2],psd[3],-1.0); + assign_sd2sd(VOLUME/2,psd[2]+(VOLUME/2),psd[4]+(VOLUME/2)); + bnd_sd2zero(ODD_PTS,psd[4]); + mulr_spinor_add_dble(VOLUME/2,psd[2]+(VOLUME/2),psd[4]+(VOLUME/2),-1.0); + d=norm_square_dble(VOLUME,1,psd[2]); + + error(d!=0.0,1,"main [check6.c]", + "Dwoe_dble() changes the output field where it should not"); + + for (i=0;i<4;i++) + random_sd(NSPIN,psd[i],1.0); + + assign_sd2sd(VOLUME,psd[0],psd[1]); + assign_sd2sd(VOLUME,psd[2],psd[3]); + Dweo_dble(psd[1],psd[2]); + + bnd_sd2zero(ODD_PTS,psd[0]); + mulr_spinor_add_dble(VOLUME,psd[1],psd[0],-1.0); + d=norm_square_dble(VOLUME,1,psd[1]); + + error(d!=0.0,1,"main [check6.c]", + "Dweo_dble() changes the input field in unexpected ways"); + + mulr_spinor_add_dble(VOLUME/2,psd[2]+(VOLUME/2),psd[3]+(VOLUME/2),-1.0); + assign_sd2sd(VOLUME/2,psd[2],psd[4]); + bnd_sd2zero(EVEN_PTS,psd[4]); + mulr_spinor_add_dble(VOLUME/2,psd[2],psd[4],-1.0); + d=norm_square_dble(VOLUME,1,psd[2]); + + error(d!=0.0,1,"main [check6.c]", + "Dweo_dble() changes the output field where it should not"); + + for (i=0;i<4;i++) + random_sd(NSPIN,psd[i],1.0); + + assign_sd2sd(VOLUME,psd[0],psd[1]); + assign_sd2sd(VOLUME,psd[2],psd[3]); + Dwhat_dble(mu,psd[1],psd[2]); + + bnd_sd2zero(EVEN_PTS,psd[0]); + mulr_spinor_add_dble(VOLUME,psd[1],psd[0],-1.0); + d=norm_square_dble(VOLUME,1,psd[1]); + + error(d!=0.0,1,"main [check6.c]", + "Dwhat_dble() changes the input field in unexpected ways"); + + mulr_spinor_add_dble(VOLUME/2,psd[2]+(VOLUME/2),psd[3]+(VOLUME/2),-1.0); + assign_sd2sd(VOLUME/2,psd[2],psd[4]); + bnd_sd2zero(EVEN_PTS,psd[4]); + mulr_spinor_add_dble(VOLUME/2,psd[2],psd[4],-1.0); + d=norm_square_dble(VOLUME,1,psd[2]); + + error(d!=0.0,1,"main [check6.c]", + "Dwhat_dble() changes the output field where it should not"); + + for (i=0;i<4;i++) + random_sd(NSPIN,psd[i],1.0); + + assign_sd2sd(VOLUME,psd[0],psd[2]); + Dw_dble(mu,psd[0],psd[1]); + Dwee_dble(mu,psd[2],psd[3]); + set_sd2zero(VOLUME/2,psd[0]); + mulr_spinor_add_dble(VOLUME/2,psd[0],psd[3],-1.0); + Dweo_dble(psd[2],psd[0]); + set_sd2zero(VOLUME/2,psd[3]); + mulr_spinor_add_dble(VOLUME/2,psd[3],psd[0],-1.0); + + Dwoo_dble(mu,psd[2],psd[3]); + Dwoe_dble(psd[2],psd[4]); + mulr_spinor_add_dble(VOLUME/2,psd[3]+(VOLUME/2),psd[4]+(VOLUME/2),1.0); + + mulr_spinor_add_dble(VOLUME,psd[3],psd[1],-1.0); + d=norm_square_dble(VOLUME,1,psd[3])/norm_square_dble(VOLUME,1,psd[1]); + d=sqrt(d); + + if (my_rank==0) + printf("Deviation of Dw_dble() from Dwee_dble(),.. = %.1e\n",d); + + for (i=0;i<4;i++) + random_sd(NSPIN,psd[i],1.0); + + assign_sd2sd(NSPIN,psd[0],psd[1]); + Dwhat_dble(mu,psd[0],psd[2]); + + Dwoe_dble(psd[1],psd[1]); + Dwee_dble(mu,psd[1],psd[1]); + Dwoo_dble(0.0,psd[1],psd[1]); + Dweo_dble(psd[1],psd[1]); + + mulr_spinor_add_dble(VOLUME/2,psd[1],psd[2],-1.0); + d=norm_square_dble(VOLUME/2,1,psd[1])/norm_square_dble(VOLUME/2,1,psd[2]); + d=sqrt(d); + + if (my_rank==0) + printf("Deviation of Dwhat_dble() from Dwee_dble(),.. = %.1e\n",d); + + for (i=0;i<4;i++) + random_sd(NSPIN,psd[i],1.0); + + assign_sd2sd(VOLUME,psd[0],psd[2]); + + set_tm_parms(1); + Dw_dble(mu,psd[0],psd[1]); + set_tm_parms(0); + + Dwee_dble(mu,psd[2],psd[3]); + mulr_spinor_add_dble(VOLUME/2,psd[1],psd[3],-1.0); + Dweo_dble(psd[2],psd[1]); + Dwoe_dble(psd[2],psd[3]); + mulr_spinor_add_dble(VOLUME/2,psd[1]+(VOLUME/2),psd[3]+(VOLUME/2),-1.0); + Dwoo_dble(0.0,psd[2],psd[3]); + mulr_spinor_add_dble(VOLUME/2,psd[1]+(VOLUME/2),psd[3]+(VOLUME/2),-1.0); + d=norm_square_dble(VOLUME,1,psd[1])/norm_square_dble(VOLUME,1,psd[2]); + d=sqrt(d); + + error_chk(); + + if (my_rank==0) + { + printf("Check of Dw_dble()|eoflg=1 = %.1e\n\n",d); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check7.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check7.c new file mode 100644 index 0000000000000000000000000000000000000000..2ad98b8f43793f63f13288b9554d208e4be0709a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check7.c @@ -0,0 +1,429 @@ + +/******************************************************************************* +* +* File check7.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Comparison of Dw_blk(),..,Dwhat_blk() with Dw(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "block.h" +#include "dirac.h" +#include "global.h" + + +static void blk_s2zero(int ic,spinor *s) +{ + int nb,isw; + int nbh,n,nm,vol; + block_t *b; + + b=blk_list(SAP_BLOCKS,&nb,&isw); + nbh=nb/2; + vol=(*b).vol; + + if (ic^isw) + n=nbh; + else + n=0; + + nm=n+nbh; + + for (;n]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + set_sap_parms(bs,0,1,1); + alloc_bgr(SAP_BLOCKS); + alloc_ws(4); + + swp=set_sw_parms(0.05); + mu=0.123f; + + if (my_rank==0) + printf("m0 = %.4e, mu = %.4e, csw = %.4e, cF = %.4e, cF' = %.4e\n\n", + swp.m0,mu,swp.csw,swp.cF[0],swp.cF[1]); + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + + assign_ud2u(); + assign_swd2sw(); + assign_ud2ubgr(SAP_BLOCKS); + assign_swd2swbgr(SAP_BLOCKS,NO_PTS); + + ps=reserve_ws(4); + b=blk_list(SAP_BLOCKS,&nb,&isw); + nbh=nb/2; + vol=(*b).vol; + volh=vol/2; + + for (itm=0;itm<2;itm++) + { + ie=0; + dmax=0.0f; + set_tm_parms(itm); + + if (my_rank==0) + printf("Twisted-mass flag = %d\n",itm); + + for (ic=0;ic<2;ic++) + { + random_s(VOLUME,ps[0],1.0f); + random_s(VOLUME,ps[2],1.0f); + blk_s2zero(ic^0x1,ps[0]); + blk_s2zero(ic^0x1,ps[2]); + + if (ic^isw) + n=nbh; + else + n=0; + + nm=n+nbh; + + for (;ndmax) + dmax=d; + } + + error_chk(); + error(ie,1,"main [check7.c]", + "Dw_blk() changes the fields where it should not"); + + dmax=(float)(sqrt((double)(dmax))); + + if (my_rank==0) + { + printf("The maximal relative deviations are:\n\n"); + printf("Dw_blk(): %.1e\n",dmax); + } + + dmax=0.0f; + random_s(VOLUME,ps[0],1.0f); + random_s(VOLUME,ps[1],1.0f); + + for (n=0;ndmax) + dmax=d; + + random_s(VOLUME,ps[0],1.0f); + random_s(VOLUME,ps[1],1.0f); + + for (n=0;ndmax) + dmax=d; + + error_chk(); + error(ie,1,"main [check7.c]", + "Dwee_blk() or Dwoo_blk() changes the fields where it should not"); + + dmax=(float)(sqrt((double)(dmax))); + + if (my_rank==0) + printf("Dwee_blk(), Dwoo_blk(): %.1e\n",dmax); + + dmax=0.0f; + + for (ic=0;ic<2;ic++) + { + random_s(VOLUME,ps[0],1.0f); + random_s(VOLUME,ps[1],1.0f); + random_s(VOLUME,ps[2],1.0f); + blk_s2zero(ic^0x1,ps[0]); + blk_s2zero(ic^0x1,ps[2]); + + if (ic^isw) + n=nbh; + else + n=0; + + nm=n+nbh; + + for (;ndmax) + dmax=d; + } + + error_chk(); + error(ie,1,"main [check7.c]", + "Dweo_blk() changes the fields where it should not"); + + dmax=(float)(sqrt((double)(dmax))); + + if (my_rank==0) + printf("Dweo_blk(): %.1e\n",dmax); + + dmax=0.0f; + + for (ic=0;ic<2;ic++) + { + random_s(VOLUME,ps[0],1.0f); + random_s(VOLUME,ps[1],1.0f); + random_s(VOLUME,ps[2],1.0f); + blk_s2zero(ic^0x1,ps[0]); + blk_s2zero(ic^0x1,ps[2]); + + if (ic^isw) + n=nbh; + else + n=0; + + nm=n+nbh; + + for (;ndmax) + dmax=d; + } + + error_chk(); + error(ie,1,"main [check7.c]", + "Dwoe_blk() changes the fields where it should not"); + + dmax=(float)(sqrt((double)(dmax))); + + if (my_rank==0) + printf("Dwoe_blk(): %.1e\n",dmax); + + dmax=0.0f; + random_s(VOLUME,ps[0],1.0f); + random_s(VOLUME,ps[1],1.0f); + + for (n=0;ndmax) + dmax=d; + + assign_s2sblk(SAP_BLOCKS,n,ALL_PTS,ps[0],0); + mulr_spinor_add(volh,b[n].s[0]+volh,b[n].s[1]+volh,-1.0f); + if (norm_square(volh,0,b[n].s[0]+volh)!=0.0f) + ie=1; + } + + error_chk(); + error(ie,1,"main [check7.c]", + "Dwhat_blk() changes the fields where it should not"); + + dmax=(float)(sqrt((double)(dmax))); + + if (NPROC>1) + { + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_FLOAT,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_FLOAT,0,MPI_COMM_WORLD); + } + + if (my_rank==0) + printf("Dwhat_blk(): %.1e\n\n",dmax); + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check7.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check7.in new file mode 100644 index 0000000000000000000000000000000000000000..bd654839cac6ab535881018a1109ac0080e8af27 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check7.in @@ -0,0 +1 @@ +bs 4 4 4 4 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check8.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check8.c new file mode 100644 index 0000000000000000000000000000000000000000..c946bc7e322993b91925d948ce44ae89417ccdf3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check8.c @@ -0,0 +1,444 @@ + +/******************************************************************************* +* +* File check8.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Comparison of Dw_blk_dble(),..,Dwhat_blk_dble with Dw_dble(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "block.h" +#include "dirac.h" +#include "global.h" + + +static void blk_sd2zero(int ic,spinor_dble *sd) +{ + int nb,isw; + int nbh,n,nm,vol; + block_t *b; + + b=blk_list(DFL_BLOCKS,&nb,&isw); + nbh=nb/2; + vol=(*b).vol; + + if (ic^isw) + n=nbh; + else + n=0; + + nm=n+nbh; + + for (;n]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + set_dfl_parms(bs,2); + alloc_bgr(DFL_BLOCKS); + alloc_wsd(4); + + swp=set_sw_parms(0.05); + mu=0.123; + + if (my_rank==0) + printf("m0 = %.4e, mu = %.4e, csw = %.4e, cF = %.4e, cF' = %.4e\n\n", + swp.m0,mu,swp.csw,swp.cF[0],swp.cF[1]); + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + + psd=reserve_wsd(4); + b=blk_list(DFL_BLOCKS,&nb,&isw); + nbh=nb/2; + vol=(*b).vol; + volh=vol/2; + + for (itm=0;itm<2;itm++) + { + ie=0; + dmax=0.0; + set_tm_parms(itm); + + if (my_rank==0) + printf("Twisted-mass flag = %d\n",itm); + + for (ic=0;ic<2;ic++) + { + random_sd(VOLUME,psd[0],1.0); + random_sd(VOLUME,psd[2],1.0); + blk_sd2zero(ic^0x1,psd[0]); + blk_sd2zero(ic^0x1,psd[2]); + + if (ic^isw) + n=nbh; + else + n=0; + + nm=n+nbh; + + for (;ndmax) + dmax=d; + } + + error_chk(); + error(ie,1,"main [check8.c]", + "Dw_blk_dble() changes the fields where it should not"); + + dmax=sqrt(dmax); + + if (my_rank==0) + { + printf("The maximal relative deviations are:\n\n"); + printf("Dw_blk_dble(): %.1e\n",dmax); + } + + dmax=0.0; + random_sd(VOLUME,psd[0],1.0); + random_sd(VOLUME,psd[1],1.0); + + for (n=0;ndmax) + dmax=d; + + random_sd(VOLUME,psd[0],1.0); + random_sd(VOLUME,psd[1],1.0); + + for (n=0;ndmax) + dmax=d; + + error_chk(); + error(ie,1,"main [check8.c]","Dwee_blk_dble() or Dwoo_blk_dble() " + "changes the fields where it should not"); + + dmax=sqrt(dmax); + + if (my_rank==0) + printf("Dwee_blk_dble(), Dwoo_blk_dble(): %.1e\n",dmax); + + dmax=0.0; + + for (ic=0;ic<2;ic++) + { + random_sd(VOLUME,psd[0],1.0); + random_sd(VOLUME,psd[1],1.0); + random_sd(VOLUME,psd[2],1.0); + blk_sd2zero(ic^0x1,psd[0]); + blk_sd2zero(ic^0x1,psd[2]); + + if (ic^isw) + n=nbh; + else + n=0; + + nm=n+nbh; + + for (;ndmax) + dmax=d; + } + + error_chk(); + error(ie,1,"main [check8.c]", + "Dweo_blk_dble() changes the fields where it should not"); + + dmax=sqrt(dmax); + + if (my_rank==0) + printf("Dweo_blk_dble(): %.1e\n",dmax); + + dmax=0.0; + + for (ic=0;ic<2;ic++) + { + random_sd(VOLUME,psd[0],1.0); + random_sd(VOLUME,psd[1],1.0); + random_sd(VOLUME,psd[2],1.0); + blk_sd2zero(ic^0x1,psd[0]); + blk_sd2zero(ic^0x1,psd[2]); + + if (ic^isw) + n=nbh; + else + n=0; + + nm=n+nbh; + + for (;ndmax) + dmax=d; + } + + error_chk(); + error(ie,1,"main [check8.c]", + "Dwoe_blk_dble() changes the fields where it should not"); + + dmax=sqrt(dmax); + + if (my_rank==0) + printf("Dwoe_blk_dble(): %.1e\n",dmax); + + dmax=0.0; + random_sd(VOLUME,psd[0],1.0); + random_sd(VOLUME,psd[1],1.0); + + for (n=0;ndmax) + dmax=d; + + assign_sd2sdblk(DFL_BLOCKS,n,ALL_PTS,psd[0],0); + mulr_spinor_add_dble(volh,b[n].sd[0]+volh,b[n].sd[1]+volh,-1.0); + if (norm_square_dble(volh,0,b[n].sd[0]+volh)!=0.0) + ie=1; + } + + error_chk(); + error(ie,1,"main [check8.c]", + "Dwhat_blk_dble() changes the fields where it should not"); + + dmax=sqrt(dmax); + + if (NPROC>1) + { + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + if (my_rank==0) + printf("Dwhat_blk_dble(): %.1e\n\n",dmax); + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check9.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check9.c new file mode 100644 index 0000000000000000000000000000000000000000..6836c34b8afb12d4350b27815f4ccfbf8d6b0cb0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check9.c @@ -0,0 +1,260 @@ + +/******************************************************************************* +* +* File check9.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Comparison of Dw_bnd() with Dw(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "block.h" +#include "sap.h" +#include "dirac.h" +#include "global.h" + +typedef union +{ + weyl w; + float r[12]; +} spin_t; + + +static void blk_s2zero(int ic,spinor *s) +{ + int nb,isw; + int nbh,n,nm,vol; + block_t *b; + + b=blk_list(SAP_BLOCKS,&nb,&isw); + nbh=nb/2; + vol=(*b).vol; + + if (ic^isw) + n=nbh; + else + n=0; + + nm=n+nbh; + + for (;n]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + set_sap_parms(bs,0,1,1); + alloc_bgr(SAP_BLOCKS); + alloc_ws(4); + + swp=set_sw_parms(0.05); + mu=0.123f; + + if (my_rank==0) + printf("m0 = %.4e, mu = %.4e, csw = %.4e, cF = %.4e, cF' = %.4e\n\n", + swp.m0,mu,swp.csw,swp.cF[0],swp.cF[1]); + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + + assign_ud2u(); + assign_swd2sw(); + assign_ud2ubgr(SAP_BLOCKS); + + ps=reserve_ws(4); + b=blk_list(SAP_BLOCKS,&nb,&isw); + nbh=nb/2; + vol=(*b).vol; + + ie=0; + dmax=0.0f; + + for (ic=0;ic<2;ic++) + { + random_s(VOLUME,ps[0],1.0f); + assign_s2s(VOLUME,ps[0],ps[3]); + + if (ic^isw) + n=nbh; + else + n=0; + nm=n+nbh; + + for (;ndmax) + dmax=d; + + if (ic^isw) + n=nbh; + else + n=0; + nm=n+nbh; + + for (;n +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,count,nt; + int i,nflds; + float mu; + double phi[2],phi_prime[2]; + double wt1,wt2,wdt; + spinor **ps; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time1.log","w",stdout); + + printf("\n"); + printf("Timing of Dw() and Dwhat()\n"); + printf("--------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + if (NPROC>1) + printf("There are %d MPI processes\n",NPROC); + else + printf("There is 1 MPI process\n"); + + if ((VOLUME*sizeof(float))<(64*1024)) + { + printf("The local size of the gauge field is %d KB\n", + (int)((72*VOLUME*sizeof(float))/(1024))); + printf("The local size of a quark field is %d KB\n", + (int)((24*VOLUME*sizeof(float))/(1024))); + } + else + { + printf("The local size of the gauge field is %d MB\n", + (int)((72*VOLUME*sizeof(float))/(1024*1024))); + printf("The local size of a quark field is %d MB\n", + (int)((24*VOLUME*sizeof(float))/(1024*1024))); + } + +#if (defined x64) +#if (defined AVX) + printf("Using AVX instructions\n"); +#else + printf("Using SSE3 instructions and 16 xmm registers\n"); +#endif +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + printf("\n"); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [time1.c]", + "Syntax: time1 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + + set_sw_parms(-0.0123); + mu=0.0785f; + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + assign_ud2u(); + assign_swd2sw(); + + nflds=(int)((4*1024*1024)/(VOLUME*sizeof(float)))+1; + if ((nflds%2)==1) + nflds+=1; + alloc_ws(nflds); + ps=reserve_ws(nflds); + + for (i=0;i +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,count,nt; + int i,nflds; + double phi[2],phi_prime[2]; + double mu,wt1,wt2,wdt; + spinor_dble **psd; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time2.log","w",stdout); + + printf("\n"); + printf("Timing of Dw_dble() and Dwhat_dble()\n"); + printf("------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + if (NPROC>1) + printf("There are %d MPI processes\n",NPROC); + else + printf("There is 1 MPI process\n"); + + if ((VOLUME*sizeof(double))<(64*1024)) + { + printf("The local size of the gauge field is %d KB\n", + (int)((72*VOLUME*sizeof(double))/(1024))); + printf("The local size of a quark field is %d KB\n", + (int)((24*VOLUME*sizeof(double))/(1024))); + } + else + { + printf("The local size of the gauge field is %d MB\n", + (int)((72*VOLUME*sizeof(double))/(1024*1024))); + printf("The local size of a quark field is %d MB\n", + (int)((24*VOLUME*sizeof(double))/(1024*1024))); + } + +#if (defined x64) +#if (defined AVX) + printf("Using AVX instructions\n"); +#else + printf("Using SSE3 instructions and 16 xmm registers\n"); +#endif +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + printf("\n"); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [time2.c]", + "Syntax: time2 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + + set_sw_parms(-0.0123); + mu=0.0785; + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + + nflds=(int)((4*1024*1024)/(VOLUME*sizeof(double)))+1; + if ((nflds%2)==1) + nflds+=1; + alloc_wsd(nflds); + psd=reserve_wsd(nflds); + + for (i=0;i +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,count,nt; + int n,nb,isw,bs[4]; + float mu; + double phi[2],phi_prime[2]; + double wt1,wt2,wdt; + block_t *b; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time3.log","w",stdout); + fin=freopen("check7.in","r",stdin); + + printf("\n"); + printf("Timing of Dw_blk() and Dwhat_blk()\n"); + printf("----------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + if (NPROC>1) + printf("There are %d MPI processes\n",NPROC); + else + printf("There is 1 MPI process\n"); + + if ((VOLUME*sizeof(float))<(64*1024)) + { + printf("The local size of the gauge field is %d KB\n", + (int)((72*VOLUME*sizeof(float))/(1024))); + printf("The local size of a quark field is %d KB\n", + (int)((24*VOLUME*sizeof(float))/(1024))); + } + else + { + printf("The local size of the gauge field is %d MB\n", + (int)((72*VOLUME*sizeof(float))/(1024*1024))); + printf("The local size of a quark field is %d MB\n", + (int)((24*VOLUME*sizeof(float))/(1024*1024))); + } + +#if (defined x64) +#if (defined AVX) + printf("Using AVX instructions\n"); +#else + printf("Using SSE3 instructions and 16 xmm registers\n"); +#endif +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + printf("\n"); + + read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + fclose(fin); + + printf("bs = %d %d %d %d\n\n",bs[0],bs[1],bs[2],bs[3]); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [time3.c]", + "Syntax: time3 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + set_sap_parms(bs,0,1,1); + alloc_bgr(SAP_BLOCKS); + + set_sw_parms(-0.0123); + mu=0.0785f; + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + assign_ud2ubgr(SAP_BLOCKS); + assign_swd2swbgr(SAP_BLOCKS,NO_PTS); + + b=blk_list(SAP_BLOCKS,&nb,&isw); + random_s((*b).vol,(*b).s[0],1.0f); + + nt=(int)(2.0e6f/(double)(VOLUME)); + if (nt<2) + nt=2; + wdt=0.0; + + while (wdt<5.0) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + for (count=0;count +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,count,nt; + int n,nb,isw,bs[4]; + double phi[2],phi_prime[2]; + double mu,wt1,wt2,wdt; + block_t *b; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time4.log","w",stdout); + fin=freopen("check7.in","r",stdin); + + printf("\n"); + printf("Timing of Dw_blk_dble() and Dwhat_blk_dble()\n"); + printf("--------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + if (NPROC>1) + printf("There are %d MPI processes\n",NPROC); + else + printf("There is 1 MPI process\n"); + + if ((VOLUME*sizeof(double))<(64*1024)) + { + printf("The local size of the gauge field is %d KB\n", + (int)((72*VOLUME*sizeof(double))/(1024))); + printf("The local size of a quark field is %d KB\n", + (int)((24*VOLUME*sizeof(double))/(1024))); + } + else + { + printf("The local size of the gauge field is %d MB\n", + (int)((72*VOLUME*sizeof(double))/(1024*1024))); + printf("The local size of a quark field is %d MB\n", + (int)((24*VOLUME*sizeof(double))/(1024*1024))); + } + +#if (defined x64) +#if (defined AVX) + printf("Using AVX instructions\n"); +#else + printf("Using SSE3 instructions and 16 xmm registers\n"); +#endif +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + printf("\n"); + + read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + fclose(fin); + + printf("bs = %d %d %d %d\n\n",bs[0],bs[1],bs[2],bs[3]); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [time3.c]", + "Syntax: time3 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + set_dfl_parms(bs,4); + alloc_bgr(DFL_BLOCKS); + + set_sw_parms(-0.0123); + mu=0.0785; + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + assign_ud2udblk(DFL_BLOCKS,0); + assign_swd2swdblk(DFL_BLOCKS,0,NO_PTS); + + b=blk_list(DFL_BLOCKS,&nb,&isw); + random_sd((*b).vol,(*b).sd[0],1.0); + + nt=(int)(2.0e6f/(double)(VOLUME)); + if (nt<2) + nt=2; + wdt=0.0; + + while (wdt<5.0) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + for (count=0;count that +allows the type of boundary condition to be chosen at runtime. When the option +is not set, open boundary conditions are assumed. + +The option may be set but has no effect in the case of check5 and check8 (the +boundary conditions are selected through the input parameter file in these +cases). diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..74e1ee1814d120588b7ef6877e7bfd026468356a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/Makefile @@ -0,0 +1,170 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 check4 check5 check6 check7 \ + check8 check9 check10 check11 time1 + +ARCHIVE = archive + +BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + +DFL = dfl_geometry dfl_subspace ltl_gcr dfl_sap_gcr dfl_modes + +DIRAC = Dw_dble Dw Dw_bnd + +FLAGS = flags action_parms dfl_parms force_parms hmc_parms lat_parms \ + sap_parms solver_parms mdint_parms rat_parms + +FORCES = force0 force1 force2 force3 force4 force5 \ + frcfcts genfrc tmcg tmcgm xtensor + +LATTICE = bcnds uidx ftidx geometry + +LINALG = salg salg_dble valg valg_dble liealg cmatrix_dble cmatrix + +LINSOLV = cgne fgcr fgcr4vd mscg + +LITTLE = Aw_gen Aw_com Aw_ops Aw_dble Aw ltl_modes + +MDFLDS = mdflds fcom + +RANDOM = ranlux ranlxs ranlxd gauss + +RATFCTS = elliptic zolotarev ratfcts + +SAP = sap_com sap_gcr sap blk_solv + +SFLDS = sflds scom sdcom Pbnd Pbnd_dble + +SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +SW_TERM = pauli pauli_dble swflds sw_term + +TCHARGE = ftcom ftensor + +UFLDS = plaq_sum shift uflds udcom bstap + +UPDATE = chrono + +UTILS = endian mutils utils wspace + +VFLDS = vflds vinit vcom vdcom + +MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ + $(LATTICE) $(LINALG) $(LINSOLV) $(LITTLE) $(MDFLDS) $(RANDOM) \ + $(RATFCTS) $(SAP) $(SFLDS) $(SU3FCTS) $(SW_TERM) $(TCHARGE) \ + $(UFLDS) $(UPDATE) $(UTILS) $(VFLDS) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/mdflds:$(MDIR)/su3fcts:\ + $(MDIR)/utils:$(MDIR)/forces:$(MDIR)/sflds:$(MDIR)/dirac:\ + $(MDIR)/sw_term:$(MDIR)/tcharge:$(MDIR)/block:$(MDIR)/sap:\ + $(MDIR)/linsolv:$(MDIR)/dfl:$(MDIR)/vflds:$(MDIR)/little:\ + $(MDIR)/update:$(MDIR)/ratfcts + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + +# -DCGNE_DBG -DFGCR_DBG -DMSCG_DBG +# -DDFL_MODES_DBG + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..0666fd85b96fc0305eaf8164cba2bc57db458554 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check1.c @@ -0,0 +1,433 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Gauge and translation invariance of the gauge action. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int bc,nfc[8],ofs[8]; +static const su3_dble ud0={{0.0}}; +static su3_dble *g,*gbuf; +static su3_dble wd ALIGNED16; + + +static double bnd_action(void) +{ + int i,j; + double c0,c1,*cG,*phi; + double s[3],d0[2],d1[2],act; + lat_parms_t lat; + bc_parms_t bcp; + + if ((bc==1)||(bc==2)) + { + lat=lat_parms(); + bcp=bc_parms(); + + s[0]=(double)(N1); + s[1]=(double)(N2); + s[2]=(double)(N3); + + for (i=0;i<2;i++) + { + d0[i]=0.0; + d1[i]=0.0; + phi=bcp.phi[i]; + + for (j=0;j<3;j++) + { + d0[i]-=(cos(phi[0]/s[j])+cos(phi[1]/s[j])+ + cos(phi[2]/s[j])-3.0); + d1[i]-=(cos(2.0*phi[0]/s[j])+cos(2.0*phi[1]/s[j])+ + cos(2.0*phi[2]/s[j])-3.0); + } + } + + c0=lat.c0; + c1=lat.c1; + cG=bcp.cG; + + act=c0*cG[1]*d0[1]+c1*d0[1]+c1*1.5*d1[1]; + + if (bc==1) + act+=(c0*cG[0]*d0[0]+c1*d0[0]+c1*1.5*d1[0]); + + return (lat.beta/3.0)*(double)(N1*N2*N3)*act; + } + else + return 0.0; +} + + + +static void pack_gbuf(void) +{ + int ifc,ib,ix; + + nfc[0]=FACE0/2; + nfc[1]=FACE0/2; + nfc[2]=FACE1/2; + nfc[3]=FACE1/2; + nfc[4]=FACE2/2; + nfc[5]=FACE2/2; + nfc[6]=FACE3/2; + nfc[7]=FACE3/2; + + ofs[0]=0; + ofs[1]=ofs[0]+nfc[0]; + ofs[2]=ofs[1]+nfc[1]; + ofs[3]=ofs[2]+nfc[2]; + ofs[4]=ofs[3]+nfc[3]; + ofs[5]=ofs[4]+nfc[4]; + ofs[6]=ofs[5]+nfc[5]; + ofs[7]=ofs[6]+nfc[6]; + + for (ifc=0;ifc<8;ifc++) + { + for (ib=0;ib0) + { + tag=mpi_tag(); + saddr=npr[ifc^0x1]; + raddr=npr[ifc]; + sbuf=gbuf+ofs[ifc]; + rbuf=g+VOLUME+ofs[ifc]; + + if (np&0x1) + { + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + } + } +} + + +static void random_g(void) +{ + int ix,t; + su3_dble unity,*gx; + + unity=ud0; + unity.c11.re=1.0; + unity.c22.re=1.0; + unity.c33.re=1.0; + gx=g; + + for (ix=0;ix0)||(bc!=1)) + random_su3_dble(gx); + else + (*gx)=unity; + + gx+=1; + } + + if (BNDRY>0) + { + pack_gbuf(); + send_gbuf(); + } +} + + +static void transform_ud(void) +{ + int ix,iy,t,ifc; + su3_dble *u; + + u=udfld(); + + for (ix=(VOLUME/2);ix(bs[mu]/2)) + svec[mu]-=bs[mu]; + } + + MPI_Bcast(svec,4,MPI_INT,0,MPI_COMM_WORLD); +} + + +int main(int argc,char *argv[]) +{ + int my_rank,n,s[4]; + double phi[2],phi_prime[2],p1,p2; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + + printf("\n"); + printf("Gauge and translation invariance of the gauge action\n"); + printf("----------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check1.c]", + "Syntax: check1 [-bc ]"); + } + + set_lat_parms(3.5,0.33,0,NULL,1.0); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.9012,1.2034,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + + g=amalloc(NSPIN*sizeof(*g),ALIGN); + if (BNDRY!=0) + gbuf=amalloc((BNDRY/2)*sizeof(*gbuf),ALIGN); + + error((g==NULL)||((BNDRY!=0)&&(gbuf==NULL)),1,"main [check1.c]", + "Unable to allocate auxiliary arrays"); + + chs_ubnd(-1); + p1=action0(1); + p2=bnd_action(); + + if (my_rank==0) + { + printf("Action after initialization = %.15e\n",p1); + printf("Expected value = %.15e\n\n",p2); + } + + random_ud(); + chs_ubnd(-1); + p1=action0(1); + random_g(); + transform_ud(); + p2=action0(1); + + if (my_rank==0) + { + printf("Random gauge field:\n"); + printf("Action = %.12e\n",p1); + printf("Gauge invariance: relative difference = %.1e\n\n", + fabs(1.0-p2/p1)); + } + + if (my_rank==0) + printf("Translation invariance:\n"); + + p1=action0(1); + + for (n=0;n<8;n++) + { + random_vec(s); + if (bc!=3) + s[0]=0; + chs_ubnd(1); + shift_ud(s); + chs_ubnd(-1); + p2=action0(1); + + if (my_rank==0) + { + printf("s=(% d, % d,% d,% d), ",s[0],s[1],s[2],s[3]); + printf("relative deviation = %.1e\n",fabs(1.0-p2/p1)); + } + } + + if (my_rank==0) + { + printf("\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check10.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check10.c new file mode 100644 index 0000000000000000000000000000000000000000..3dcd7f2d66ac2606eea15c6c837abb3ec98998ea --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check10.c @@ -0,0 +1,450 @@ + +/******************************************************************************* +* +* File check10.c +* +* Copyright (C) 2012, 2013 Martin Luescher, Stefan Schaefer +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of force4() and action4(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dfl.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) + + +static void rot_ud(double eps) +{ + int bc,ix,t,ifc; + su3_dble *u; + su3_alg_dble *mom; + mdflds_t *mdfs; + + bc=bc_type(); + mdfs=mdflds(); + mom=(*mdfs).mom; + u=udfld(); + + for (ix=(VOLUME/2);ix]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.782); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + chi[0]=0.123; + chi[1]=-0.534; + chi_prime[0]=0.912; + chi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,0.953,1.203,chi,chi_prime); + print_bc_parms(); + + if (my_rank==0) + { + find_section("SAP"); + read_iprms("bs",4,bs); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + set_sap_parms(bs,1,4,5); + + if (my_rank==0) + { + find_section("Deflation subspace"); + read_iprms("bs",4,bs); + read_line("Ns","%d",&Ns); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_parms(bs,Ns); + + if (my_rank==0) + { + find_section("Deflation subspace generation"); + read_line("kappa","%lf",&kappa); + read_line("mu","%lf",&mu); + read_line("ninv","%d",&ninv); + read_line("nmr","%d",&nmr); + read_line("ncy","%d",&ncy); + } + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&ninv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_gen_parms(kappa,mu,ninv,nmr,ncy); + + if (my_rank==0) + { + find_section("Deflation projection"); + read_line("nkv","%d",&nkv); + read_line("nmx","%d",&nmx); + read_line("res","%lf",&res); + } + + MPI_Bcast(&nkv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + set_dfl_pro_parms(nkv,nmx,res); + + set_hmc_parms(0,NULL,1,0,NULL,1,1.0); + mnkv=0; + + for (isp=0;isp<3;isp++) + { + read_solver_parms(isp); + sp=solver_parms(isp); + + if (sp.nkv>mnkv) + mnkv=sp.nkv; + } + + if (my_rank==0) + fclose(fin); + + print_solver_parms(&isap,&idfl); + print_sap_parms(1); + print_dfl_parms(0); + + start_ranlux(0,1245); + geometry(); + + set_sw_parms(-0.0123); + mnkv=2*mnkv+2; + if (mnkv<(Ns+2)) + mnkv=Ns+2; + if (mnkv<5) + mnkv=5; + + alloc_ws(mnkv); + alloc_wsd(7); + alloc_wv(2*nkv+2); + alloc_wvd(4); + + for (isw=0;isw<2;isw++) + { + for (isp=0;isp<3;isp++) + { + if (isp==0) + { + mu=1.0; + eps=1.0e-4; + } + else if (isp==1) + { + mu=0.1; + eps=2.0e-4; + } + else + { + mu=0.01; + eps=3.0e-4; + } + + random_ud(); + chs_ubnd(-1); + random_mom(); + + if (isp==2) + { + dfl_modes(status); + error_root(status[0]<0,1,"main [check10.c]", + "dfl_modes failed"); + } + + status[0]=0; + status[1]=0; + + act0=setpf4(mu,0,isw,0); + + act1=action4(mu,0,isw,isp,0,status); + error_root((status[0]<0)||(status[1]<0),1, + "main [check10.c]","action4 failed %d ",isp); + + rdmy=fabs(act1-act0); + MPI_Reduce(&rdmy,dev_act,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + rdmy=act1-act0; + MPI_Reduce(&rdmy,dev_act+1,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(dev_act,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rot_ud(eps); + dsdt=dSdt(mu,0,isw,isp,status); + + if (my_rank==0) + { + printf("Solver number %d, isw %d\n", + isp,isw); + + if (isp==0) + printf("Status = %d\n",status[0]); + else if (isp==1) + printf("Status = %d,%d\n",status[0],status[1]); + else + printf("Status = (%d,%d,%d),(%d,%d,%d)\n", + status[0],status[1],status[2],status[3], + status[4],status[5]); + + printf("Absolute action difference |setpf4-action4| = %.1e,", + fabs(dev_act[1])); + printf(" %.1e (local)\n",dev_act[0]); + fflush(flog); + } + + rot_ud(eps); + act0=2.0*action4(mu,0,isw,isp,0,status)/3.0; + rot_ud(-eps); + + rot_ud(-eps); + act1=2.0*action4(mu,0,isw,isp,0,status)/3.0; + rot_ud(eps); + + rot_ud(2.0*eps); + act0-=action4(mu,0,isw,isp,0,status)/12.0; + rot_ud(-2.0*eps); + + rot_ud(-2.0*eps); + act1-=action4(mu,0,isw,isp,0,status)/12.0; + rot_ud(2.0*eps); + + dact=1.2345*(act0-act1)/eps; + dev_frc=dsdt-dact; + sig_loss=-log10(fabs(1.0-act0/act1)); + + rdmy=dsdt; + MPI_Reduce(&rdmy,&dsdt,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dsdt,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=dev_frc; + MPI_Reduce(&rdmy,&dev_frc,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dev_frc,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=sig_loss; + MPI_Reduce(&rdmy,&sig_loss,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&sig_loss,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error_chk(); + + if (my_rank==0) + { + printf("Relative deviation of dS/dt = %.2e ",fabs(dev_frc/dsdt)); + printf("[significance loss = %d digits]\n\n",(int)(sig_loss)); + fflush(flog); + } + } + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check10.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check10.in new file mode 100644 index 0000000000000000000000000000000000000000..ff9c44f39aa004b13ba19ba377713fad35faaa11 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check10.in @@ -0,0 +1,43 @@ + +[Solver 0] +solver CGNE +nmx 256 +res 1.0e-12 + +[Solver 1] +solver SAP_GCR +nmx 128 +nkv 16 +isolv 0 +nmr 4 +ncy 3 +res 1.0e-12 + +[Solver 2] +solver DFL_SAP_GCR +nmx 64 +nkv 16 +isolv 1 +nmr 4 +ncy 5 +res 1.0e-12 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 8 + +[Deflation subspace generation] +kappa 0.1350 +mu 0.01 +ninv 5 +nmr 4 +ncy 5 + +[Deflation projection] +nkv 16 +nmx 64 +res 1.0e-2 + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check11.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check11.c new file mode 100644 index 0000000000000000000000000000000000000000..b3466b0c78314eabdcf484ff07300fde984791d7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check11.c @@ -0,0 +1,453 @@ + +/******************************************************************************* +* +* File check11.c +* +* Copyright (C) 2012, 2013 Martin Luescher, Stefan Schaefer +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of force5() and action5(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dfl.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) + + +static void rot_ud(double eps) +{ + int bc,ix,t,ifc; + su3_dble *u; + su3_alg_dble *mom; + mdflds_t *mdfs; + + bc=bc_type(); + mdfs=mdflds(); + mom=(*mdfs).mom; + u=udfld(); + + for (ix=(VOLUME/2);ix]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.782); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + chi[0]=0.123; + chi[1]=-0.534; + chi_prime[0]=0.912; + chi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,0.953,1.203,chi,chi_prime); + print_bc_parms(); + + if (my_rank==0) + { + find_section("SAP"); + read_iprms("bs",4,bs); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + set_sap_parms(bs,1,4,5); + + if (my_rank==0) + { + find_section("Deflation subspace"); + read_iprms("bs",4,bs); + read_line("Ns","%d",&Ns); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_parms(bs,Ns); + + if (my_rank==0) + { + find_section("Deflation subspace generation"); + read_line("kappa","%lf",&kappa); + read_line("mu","%lf",&mu); + read_line("ninv","%d",&ninv); + read_line("nmr","%d",&nmr); + read_line("ncy","%d",&ncy); + } + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&ninv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_gen_parms(kappa,mu,ninv,nmr,ncy); + + if (my_rank==0) + { + find_section("Deflation projection"); + read_line("nkv","%d",&nkv); + read_line("nmx","%d",&nmx); + read_line("res","%lf",&res); + } + + MPI_Bcast(&nkv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + set_dfl_pro_parms(nkv,nmx,res); + + set_hmc_parms(0,NULL,1,0,NULL,1,1.0); + mnkv=0; + + for (isp=0;isp<3;isp++) + { + read_solver_parms(isp); + sp=solver_parms(isp); + + if (sp.nkv>mnkv) + mnkv=sp.nkv; + } + + if (my_rank==0) + fclose(fin); + + print_solver_parms(&isap,&idfl); + print_sap_parms(1); + print_dfl_parms(0); + + start_ranlux(0,1245); + geometry(); + + set_sw_parms(-0.0123); + mnkv=2*mnkv+2; + if (mnkv<(Ns+2)) + mnkv=Ns+2; + if (mnkv<5) + mnkv=5; + + alloc_ws(mnkv); + alloc_wsd(7); + alloc_wv(2*nkv+2); + alloc_wvd(4); + + for (isw=0;isw<2;isw++) + { + for (isp=0;isp<3;isp++) + { + if (isp==0) + { + mu0=1.0; + mu1=1.5; + eps=1.0e-4; + } + else if (isp==1) + { + mu0=0.1; + mu1=0.25; + eps=2.0e-4; + } + else + { + mu0=0.01; + mu1=0.02; + eps=3.0e-4; + } + + random_ud(); + chs_ubnd(-1); + random_mom(); + + if (isp==2) + { + dfl_modes(status); + error_root(status[0]<0,1,"main [check11.c]", + "dfl_modes failed"); + } + + status[0]=0; + status[1]=0; + + act0=setpf5(mu0,mu1,0,isp,0,status); + + act1=action5(mu0,mu1,0,isp,0,status); + error_root((status[0]<0)||(status[1]<0),1, + "main [check11.c]","action5 failed %d ",isp); + + rdmy=fabs(act1-act0); + MPI_Reduce(&rdmy,dev_act,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + rdmy=act1-act0; + MPI_Reduce(&rdmy,dev_act+1,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(dev_act,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rot_ud(eps); + dsdt=dSdt(mu0,mu1,0,isw,isp,status); + + if (my_rank==0) + { + printf("Solver number %d, isw %d\n", + isp,isw); + + if (isp==0) + printf("Status = %d\n",status[0]); + else if (isp==1) + printf("Status = %d,%d\n",status[0],status[1]); + else + printf("Status = (%d,%d,%d),(%d,%d,%d)\n", + status[0],status[1],status[2],status[3], + status[4],status[5]); + + printf("Absolute action difference |setpf5-action5| = %.1e,", + fabs(dev_act[1])); + printf(" %.1e (local)\n",dev_act[0]); + fflush(flog); + } + + rot_ud(eps); + act0=2.0*action5(mu0,mu1,0,isp,0,status)/3.0; + rot_ud(-eps); + + rot_ud(-eps); + act1=2.0*action5(mu0,mu1,0,isp,0,status)/3.0; + rot_ud(eps); + + rot_ud(2.0*eps); + act0-=action5(mu0,mu1,0,isp,0,status)/12.0; + rot_ud(-2.0*eps); + + rot_ud(-2.0*eps); + act1-=action5(mu0,mu1,0,isp,0,status)/12.0; + rot_ud(2.0*eps); + + dact=1.2345*(act0-act1)/eps; + dev_frc=dsdt-dact; + sig_loss=-log10(fabs(1.0-act0/act1)); + + rdmy=dsdt; + MPI_Reduce(&rdmy,&dsdt,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dsdt,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=dev_frc; + MPI_Reduce(&rdmy,&dev_frc,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dev_frc,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=sig_loss; + MPI_Reduce(&rdmy,&sig_loss,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&sig_loss,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error_chk(); + + if (my_rank==0) + { + printf("Relative deviation of dS/dt = %.2e ",fabs(dev_frc/dsdt)); + printf("[significance loss = %d digits]\n\n",(int)(sig_loss)); + fflush(flog); + } + } + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..c2b841801a3dd1ee5a9edd0491ca67c41f124ba2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check2.c @@ -0,0 +1,459 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2012-2014 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Gauge action of constant Abelian background fields. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int bc,np[4],bo[4]; +static double mt[4][4],inp[4],twopi; +static su3_dble ud0={{0.0}}; + + +static double afld(int *x,int mu) +{ + int nu; + double xt[4],phi; + + xt[0]=(double)(safe_mod(x[0],N0)); + xt[1]=(double)(safe_mod(x[1],N1)); + xt[2]=(double)(safe_mod(x[2],N2)); + xt[3]=(double)(safe_mod(x[3],N3)); + + phi=0.0; + + for (nu=0;nu1) + { + MPI_Reduce(&rs0,s0,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Reduce(&rs1,s1,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(s0,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(s1,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + { + (*s0)=rs0; + (*s1)=rs1; + } +} + + +static double Amt(void) +{ + int mu,nu; + double c0,c1,*cG; + double smt0,smt1,sms0,sms1,pi; + double xl[4],phi,n0,s0,s1,bs0,bs1; + lat_parms_t lat; + bc_parms_t bcp; + + lat=lat_parms(); + c0=lat.c0; + c1=lat.c1; + bcp=bc_parms(); + cG=bcp.cG; + + xl[0]=(double)(N0); + xl[1]=(double)(N1); + xl[2]=(double)(N2); + xl[3]=(double)(N3); + + pi=4.0*atan(1.0); + smt0=0.0; + smt1=0.0; + sms0=0.0; + sms1=0.0; + + for (mu=1;mu<4;mu++) + { + for (nu=0;nu=(VOLUME/2)) + { + x[0]=bo[0]+x0; + x[1]=bo[1]+x1; + x[2]=bo[2]+x2; + x[3]=bo[3]+x3; + + u=udb+8*(ix-(VOLUME/2)); + + for (ifc=0;ifc<8;ifc++) + { + if (ifc&0x1) + x[ifc/2]-=1; + + phi=afld(x,ifc/2); + + if (ifc&0x1) + x[ifc/2]+=1; + + (*u)=ud0; + (*u).c11.re=cos(phi); + (*u).c11.im=sin(phi); + (*u).c22.re=(*u).c11.re; + (*u).c22.im=(*u).c11.im; + (*u).c33.re=cos(-2.0*phi); + (*u).c33.im=sin(-2.0*phi); + u+=1; + } + } + } + } + } + } + + set_bc(); + set_flags(UPDATED_UD); +} + + +int main(int argc,char *argv[]) +{ + int my_rank,i; + double A1,A2,d,dmax; + double phi[2],phi_prime[2]; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check2.log","w",stdout); + printf("\n"); + printf("Gauge action of constant Abelian background fields\n"); + printf("--------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check2.c]", + "Syntax: check2 [-bc ]"); + } + + set_lat_parms(3.5,0.33,0,NULL,1.0); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.9012,1.2034,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123); + geometry(); + + twopi=8.0*atan(1.0); + + np[0]=N0; + np[1]=N1; + np[2]=N2; + np[3]=N3; + + bo[0]=cpr[0]*L0; + bo[1]=cpr[1]*L1; + bo[2]=cpr[2]*L2; + bo[3]=cpr[3]*L3; + + inp[0]=1.0/(double)(np[0]); + inp[1]=1.0/(double)(np[1]); + inp[2]=1.0/(double)(np[2]); + inp[3]=1.0/(double)(np[3]); + + dmax=0.0; + + for (i=0;i<10;i++) + { + choose_mt(); + set_ud(); + + A1=Amt(); + A2=action0(1); + + if (my_rank==0) + printf("Field no = %2d, A1 = %12.6e, A2 = %12.6e\n",i+1,A1,A2); + + d=fabs(A1-A2)/A1; + if (d>dmax) + dmax=d; + } + + error_chk(); + + if (my_rank==0) + { + printf("\n"); + printf("Maximal relative deviation = %.1e\n\n",dmax); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..d4b69a7868ffcc5bd4d7d9f7fb0a749ac86e34f7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check3.c @@ -0,0 +1,337 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2005, 2008-2013 Martin Luescher, Filippo Palombi +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the programs force0() and action0(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "linalg.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) + + +static void rot_ud(double eps) +{ + int bc,ix,t,ifc; + su3_dble *u; + su3_alg_dble *mom; + mdflds_t *mdfs; + + bc=bc_type(); + mdfs=mdflds(); + mom=(*mdfs).mom; + u=udfld(); + + for (ix=(VOLUME/2);ix]"); + } + + set_lat_parms(3.5,0.33,0,NULL,1.0); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.9012,1.2034,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + alloc_wfd(1); + c=0.789; + chk_chs(c); + + for (k=0;k<4;k++) + { + random_ud(); + chs_ubnd(-1); + random_mom(); + dsdt=dSdt(c); + + eps=1.0e-4; + rot_ud(eps); + act0=2.0*action0(0)/3.0; + rot_ud(-eps); + + rot_ud(-eps); + act1=2.0*action0(0)/3.0; + rot_ud(eps); + + rot_ud(2.0*eps); + act0-=action0(0)/12.0; + rot_ud(-2.0*eps); + + rot_ud(-2.0*eps); + act1-=action0(0)/12.0; + rot_ud(2.0*eps); + + act0*=c; + act1*=c; + + dact=(act0-act1)/eps; + dev_frc=dsdt-dact; + sig_loss=-log10(fabs(1.0-act0/act1)); + + rdmy=dsdt; + MPI_Reduce(&rdmy,&dsdt,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dsdt,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=dev_frc; + MPI_Reduce(&rdmy,&dev_frc,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dev_frc,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=sig_loss; + MPI_Reduce(&rdmy,&sig_loss,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&sig_loss,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + ie=check_bc(0.0); + error_root(ie!=1,1,"main [check3.c]", + "Operations did not preserve boundary conditions"); + + error_chk(); + + if (my_rank==0) + { + printf("Relative deviation of dS/dt = %.2e ",fabs(dev_frc/dsdt)); + printf("[significance loss = %d digits]\n",(int)(sig_loss)); + } + } + + if (my_rank==0) + { + printf("\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check4.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check4.c new file mode 100644 index 0000000000000000000000000000000000000000..eb9ed18331c1c4878fd8948a46f820c9c3fab8eb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check4.c @@ -0,0 +1,737 @@ + +/******************************************************************************* +* +* File check4.c +* +* Copyright (C) 2005, 2008-2013 Martin Luescher, Filippo Palombi, +* Stefan Schaefer +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of sw_frc() and hop_frc(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define MAX_LEVELS 8 +#define BLK_LENGTH 8 + +static int cnt[MAX_LEVELS]; +static double smx[MAX_LEVELS]; + + +static int is_Xt_zero(u3_alg_dble *X) +{ + int ie; + + ie=1; + ie&=((*X).c1==0.0); + ie&=((*X).c2==0.0); + ie&=((*X).c3==0.0); + ie&=((*X).c4==0.0); + ie&=((*X).c5==0.0); + ie&=((*X).c6==0.0); + ie&=((*X).c7==0.0); + ie&=((*X).c8==0.0); + ie&=((*X).c9==0.0); + + return ie; +} + + +static int is_Xv_zero(su3_dble *X) +{ + int ie; + + ie=1; + ie&=((*X).c11.re==0.0); + ie&=((*X).c11.im==0.0); + ie&=((*X).c12.re==0.0); + ie&=((*X).c12.im==0.0); + ie&=((*X).c13.re==0.0); + ie&=((*X).c13.im==0.0); + + ie&=((*X).c21.re==0.0); + ie&=((*X).c21.im==0.0); + ie&=((*X).c22.re==0.0); + ie&=((*X).c22.im==0.0); + ie&=((*X).c23.re==0.0); + ie&=((*X).c23.im==0.0); + + ie&=((*X).c31.re==0.0); + ie&=((*X).c31.im==0.0); + ie&=((*X).c32.re==0.0); + ie&=((*X).c32.im==0.0); + ie&=((*X).c33.re==0.0); + ie&=((*X).c33.im==0.0); + + return ie; +} + + +static int is_frc_zero(su3_alg_dble *f) +{ + int ie; + + ie=1; + ie&=((*f).c1==0.0); + ie&=((*f).c2==0.0); + ie&=((*f).c3==0.0); + ie&=((*f).c4==0.0); + ie&=((*f).c5==0.0); + ie&=((*f).c6==0.0); + ie&=((*f).c7==0.0); + ie&=((*f).c8==0.0); + + return ie; +} + + +static void check_Xtbnd(ptset_t set) +{ + int bc,ix,t,n,ie; + int ia,ib; + u3_alg_dble **xt; + + bc=bc_type(); + xt=xtensor(); + ie=0; + ia=0; + ib=VOLUME; + + if (set==EVEN_PTS) + ib=(VOLUME/2); + else if (set==ODD_PTS) + ia=(VOLUME/2); + else if (set==NO_PTS) + ia=VOLUME; + + for (ix=0;ix=ia)&&(ix1.0) + c=pow(4.0+swp.m0,-6.0); + else + c=1.0; + + for (n=0;nvol) + im=vol; + p=1.0; + + for (;ix0)||(bc==3))&&((t<(N0-1))||(bc!=0))) + { + z=det_pauli_dble(0.0,m); + p*=(c*z.re); + z=det_pauli_dble(0.0,m+1); + p*=(c*z.re); + } + + m+=2; + } + + cnt[0]+=1; + smx[0]-=log(fabs(p)); + + for (n=1;(cnt[n-1]>=BLK_LENGTH)&&(n]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.782); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + chi[0]=0.123; + chi[1]=-0.534; + chi_prime[0]=0.912; + chi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,0.953,1.203,chi,chi_prime); + print_bc_parms(); + + start_ranlux(0,1245); + geometry(); + + set_sw_parms(-0.0123); + alloc_wsd(6); + phi=reserve_wsd(1); + + for (k=1;k<=4;k++) + { + random_ud(); + chs_ubnd(-1); + random_mom(); + random_sd(VOLUME,phi[0],1.0); + bnd_sd2zero(ALL_PTS,phi[0]); + dsdt=dSdt(k,phi); + + eps=5.0e-5; + rot_ud(eps); + act0=2.0*action(k,phi)/3.0; + rot_ud(-eps); + + rot_ud(-eps); + act1=2.0*action(k,phi)/3.0; + rot_ud(eps); + + rot_ud(2.0*eps); + act0-=action(k,phi)/12.0; + rot_ud(-2.0*eps); + + rot_ud(-2.0*eps); + act1-=action(k,phi)/12.0; + rot_ud(2.0*eps); + + s[0]=dsdt-(act0-act1)/eps; + s[1]=dsdt; + + if (NPROC>1) + { + MPI_Reduce(s,r,2,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(r,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + { + r[0]=s[0]; + r[1]=s[1]; + } + + dev_frc=fabs(r[0]/r[1]); + sig_loss=-log10(fabs(1.0-act0/act1)); + + error_chk(); + + if (my_rank==0) + { + printf("Calculation of the force for S=(phi,Q^%d*phi):\n",k); + printf("Relative deviation of dS/dt = %.2e ",dev_frc); + printf("[significance loss = %d digits]\n\n",(int)(sig_loss)); + } + } + + if (my_rank==0) + printf("Calculation of the force for S=-2*Tr{ln(SW term)}:\n"); + + for (k=0;k<4;k++) + { + if (k==0) + set=NO_PTS; + else if (k==1) + set=EVEN_PTS; + else if (k==2) + set=ODD_PTS; + else + set=ALL_PTS; + + random_ud(); + chs_ubnd(-1); + random_mom(); + dsdt=dSdt_det(set); + + eps=5.0e-4; + rot_ud(eps); + act0=2.0*action_det(set)/3.0; + rot_ud(-eps); + + rot_ud(-eps); + act1=2.0*action_det(set)/3.0; + rot_ud(eps); + + rot_ud(2.0*eps); + act0-=action_det(set)/12.0; + rot_ud(-2.0*eps); + + rot_ud(-2.0*eps); + act1-=action_det(set)/12.0; + rot_ud(2.0*eps); + + s[0]=dsdt-(act0-act1)/eps; + s[1]=dsdt; + + if (NPROC>1) + { + MPI_Reduce(s,r,2,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(r,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + { + r[0]=s[0]; + r[1]=s[1]; + } + + if (k>0) + { + dev_frc=fabs(r[0]/r[1]); + sig_loss=-log10(fabs(1.0-act0/act1)); + } + else + dev_frc=fabs(r[0]); + + error_chk(); + + if (my_rank==0) + { + if (k==0) + printf("set=NO_PTS: "); + else if (k==1) + printf("set=EVEN_PTS: "); + else if (k==2) + printf("set=ODD_PTS: "); + else + printf("set=ALL_PTS: "); + + if (k>0) + { + printf("relative deviation of dS/dt = %.2e ",dev_frc); + printf("[significance loss = %d digits]\n",(int)(sig_loss)); + } + else + printf("absolute deviation of dS/dt = %.2e\n",dev_frc); + } + } + + if (my_rank==0) + { + printf("\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check5.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check5.c new file mode 100644 index 0000000000000000000000000000000000000000..0643311962385df4b02ecb1985f3d15436ed31a4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check5.c @@ -0,0 +1,296 @@ + +/******************************************************************************* +* +* File check5.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check and performance of the CG solver. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "archive.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "linsolv.h" +#include "forces.h" +#include "global.h" + +static int my_rank,bc,first,last,step,nmx; +static double kappa,csw,mu,cF,cF_prime; +static double phi[2],phi_prime[2],m0,res; +static char cnfg_dir[NAME_SIZE],cnfg_file[NAME_SIZE],nbase[NAME_SIZE]; + + +static void Dhatop_dble(spinor_dble *s,spinor_dble *r) +{ + Dwhat_dble(mu,s,r); + mulg5_dble(VOLUME/2,r); + mu=-mu; +} + + +static void Dhatop(spinor *s,spinor *r) +{ + Dwhat((float)(mu),s,r); + mulg5(VOLUME/2,r); + mu=-mu; +} + + +int main(int argc,char *argv[]) +{ + int nsize,icnfg,status,ie; + double rho,nrm,del; + double wt1,wt2,wdt; + complex_dble z; + spinor **ws; + spinor_dble **wsd,**psd; + lat_parms_t lat; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check5.log","w",stdout); + fin=freopen("check5.in","r",stdin); + + printf("\n"); + printf("Check and performance of the CG solver\n"); + printf("--------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + find_section("Configurations"); + read_line("name","%s",nbase); + read_line("cnfg_dir","%s",cnfg_dir); + read_line("first","%d",&first); + read_line("last","%d",&last); + read_line("step","%d",&step); + + find_section("Lattice parameters"); + read_line("kappa","%lf",&kappa); + read_line("csw","%lf",&csw); + read_line("mu","%lf",&mu); + + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + cF=1.0; + cF_prime=1.0; + + if (bc==1) + read_dprms("phi",2,phi); + + if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + + if (bc!=3) + read_line("cF","%lf",&cF); + + if (bc==2) + read_line("cF'","%lf",&cF_prime); + else + cF_prime=cF; + + find_section("CG"); + read_line("nmx","%d",&nmx); + read_line("res","%lf",&res); + + fclose(fin); + } + + MPI_Bcast(nbase,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(cnfg_dir,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(&first,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&last,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&step,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&csw,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + lat=set_lat_parms(5.5,1.0,1,&kappa,csw); + print_lat_parms(); + + set_bc_parms(bc,1.0,1.0,cF,cF_prime,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + + m0=lat.m0[0]; + set_sw_parms(m0); + + if (my_rank==0) + { + printf("mu = %.6f\n\n",mu); + + printf("CG parameters:\n"); + printf("nmx = %d\n",nmx); + printf("res = %.2e\n\n",res); + + printf("Configurations %sn%d -> %sn%d in steps of %d\n\n", + nbase,first,nbase,last,step); + fflush(flog); + } + + alloc_ws(5); + alloc_wsd(6); + psd=reserve_wsd(3); + + error_root(((last-first)%step)!=0,1,"main [check5.c]", + "last-first is not a multiple of step"); + check_dir_root(cnfg_dir); + nsize=name_size("%s/%sn%d",cnfg_dir,nbase,last); + error_root(nsize>=NAME_SIZE,1,"main [check5.c]", + "configuration file name is too long"); + + for (icnfg=first;icnfg<=last;icnfg+=step) + { + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + import_cnfg(cnfg_file); + + if (my_rank==0) + { + printf("Configuration no %d\n\n",icnfg); + fflush(flog); + } + + chs_ubnd(-1); + random_sd(VOLUME,psd[0],1.0); + bnd_sd2zero(ALL_PTS,psd[0]); + nrm=sqrt(norm_square_dble(VOLUME,1,psd[0])); + assign_sd2sd(VOLUME,psd[0],psd[2]); + + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + rho=tmcg(nmx,res,mu,psd[0],psd[1],&status); + + MPI_Barrier(MPI_COMM_WORLD); + wt2=MPI_Wtime(); + wdt=wt2-wt1; + + error_chk(); + z.re=-1.0; + z.im=0.0; + mulc_spinor_add_dble(VOLUME,psd[2],psd[0],z); + del=norm_square_dble(VOLUME,1,psd[2]); + error_root(del!=0.0,1,"main [check5.c]", + "Source field is not preserved"); + + Dw_dble(mu,psd[1],psd[2]); + mulg5_dble(VOLUME,psd[2]); + Dw_dble(-mu,psd[2],psd[1]); + mulg5_dble(VOLUME,psd[1]); + mulc_spinor_add_dble(VOLUME,psd[1],psd[0],z); + del=sqrt(norm_square_dble(VOLUME,1,psd[1])); + + if (my_rank==0) + { + printf("Solution w/o eo-preconditioning:\n"); + printf("status = %d\n",status); + printf("rho = %.2e, res = %.2e\n",rho,res); + printf("check = %.2e, check = %.2e\n",del,del/nrm); + printf("time = %.2e sec (total)\n",wdt); + if (status>0) + printf(" = %.2e usec (per point and CG iteration)", + (1.0e6*wdt)/((double)(status)*(double)(VOLUME))); + printf("\n\n"); + fflush(flog); + } + + ws=reserve_ws(5); + wsd=reserve_wsd(2); + ie=sw_term(ODD_PTS); + error_root(ie!=0,1,"main [check5.c]", + "Inversion of the SW term failed"); + assign_swd2sw(); + + random_sd(VOLUME/2,psd[0],1.0); + bnd_sd2zero(ALL_PTS,psd[0]); + nrm=sqrt(norm_square_dble(VOLUME/2,1,psd[0])); + assign_sd2sd(VOLUME/2,psd[0],psd[2]); + + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + rho=cgne(VOLUME/2,1,Dhatop,Dhatop_dble,ws,wsd,nmx,res, + psd[0],psd[1],&status); + + MPI_Barrier(MPI_COMM_WORLD); + wt2=MPI_Wtime(); + wdt=wt2-wt1; + + error_chk(); + z.re=-1.0; + z.im=0.0; + mulc_spinor_add_dble(VOLUME/2,psd[2],psd[0],z); + del=norm_square_dble(VOLUME/2,1,psd[2]); + error_root(del!=0.0,1,"main [check5.c]", + "Source field is not preserved"); + + Dhatop_dble(psd[1],psd[2]); + Dhatop_dble(psd[2],psd[1]); + mulc_spinor_add_dble(VOLUME/2,psd[1],psd[0],z); + del=sqrt(norm_square_dble(VOLUME/2,1,psd[1])); + + if (my_rank==0) + { + printf("Solution with eo-preconditioning:\n"); + printf("status = %d\n",status); + printf("rho = %.2e, res = %.2e\n",rho,res); + printf("check = %.2e, check = %.2e\n",del,del/nrm); + printf("time = %.2e sec (total)\n",wdt); + if (status>0) + printf(" = %.2e usec (per point and CG iteration)", + (1.0e6*wdt)/((double)(status)*(double)(VOLUME))); + printf("\n\n"); + fflush(flog); + } + + release_wsd(); + release_ws(); + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check5.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check5.in new file mode 100644 index 0000000000000000000000000000000000000000..a9407b43b0af4193d51d669a6fb92d0835e17bc2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check5.in @@ -0,0 +1,23 @@ + +[Configurations] +name 16x8x8x8b6.00id2 +cnfg_dir /home/data/openQCD/cnfg +first 7 +last 7 +step 1 + +[Lattice parameters] +kappa 0.1280 +csw 1.2 +mu 1.0 + +[Boundary conditions] +type 0 +#phi 0.12 -0.56 +#phi' 0.92 0.76 +cF 0.95 +#cF' 0.90 + +[CG] +nmx 256 +res 1.0e-12 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check6.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check6.c new file mode 100644 index 0000000000000000000000000000000000000000..9cc69cc7319d107075a7790aa061500ce03be274 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check6.c @@ -0,0 +1,444 @@ + +/******************************************************************************* +* +* File check6.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of force1() and action1(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dfl.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) + + +static void rot_ud(double eps) +{ + int bc,ix,t,ifc; + su3_dble *u; + su3_alg_dble *mom; + mdflds_t *mdfs; + + bc=bc_type(); + mdfs=mdflds(); + mom=(*mdfs).mom; + u=udfld(); + + for (ix=(VOLUME/2);ix]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.782); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + chi[0]=0.123; + chi[1]=-0.534; + chi_prime[0]=0.912; + chi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,0.953,1.203,chi,chi_prime); + print_bc_parms(); + + if (my_rank==0) + { + find_section("SAP"); + read_iprms("bs",4,bs); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + set_sap_parms(bs,1,4,5); + + if (my_rank==0) + { + find_section("Deflation subspace"); + read_iprms("bs",4,bs); + read_line("Ns","%d",&Ns); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_parms(bs,Ns); + + if (my_rank==0) + { + find_section("Deflation subspace generation"); + read_line("kappa","%lf",&kappa); + read_line("mu","%lf",&mu); + read_line("ninv","%d",&ninv); + read_line("nmr","%d",&nmr); + read_line("ncy","%d",&ncy); + } + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&ninv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_gen_parms(kappa,mu,ninv,nmr,ncy); + + if (my_rank==0) + { + find_section("Deflation projection"); + read_line("nkv","%d",&nkv); + read_line("nmx","%d",&nmx); + read_line("res","%lf",&res); + } + + MPI_Bcast(&nkv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + set_dfl_pro_parms(nkv,nmx,res); + + set_hmc_parms(0,NULL,1,0,NULL,1,1.0); + mnkv=0; + + for (isp=0;isp<3;isp++) + { + read_solver_parms(isp); + sp=solver_parms(isp); + + if (sp.nkv>mnkv) + mnkv=sp.nkv; + } + + if (my_rank==0) + fclose(fin); + + print_solver_parms(&isap,&idfl); + print_sap_parms(1); + print_dfl_parms(0); + + start_ranlux(0,1245); + geometry(); + + set_sw_parms(-0.0123); + mnkv=2*mnkv+2; + if (mnkv<(Ns+2)) + mnkv=Ns+2; + if (mnkv<5) + mnkv=5; + + alloc_ws(mnkv); + alloc_wsd(6); + alloc_wv(2*nkv+2); + alloc_wvd(4); + + for (isp=0;isp<3;isp++) + { + if (isp==0) + { + mu=1.0; + eps=1.0e-4; + } + else if (isp==1) + { + mu=0.1; + eps=2.0e-4; + } + else + { + mu=0.01; + eps=3.0e-4; + } + + random_ud(); + chs_ubnd(-1); + random_mom(); + + if (isp==2) + { + dfl_modes(status); + error_root(status[0]<0,1,"main [check6.c]", + "dfl_modes failed"); + } + + status[0]=0; + status[1]=0; + + act0=setpf1(mu,0,0); + act1=action1(mu,0,isp,0,status); + error_root((status[0]<0)||(status[1]<0),1,"main [check6.c]", + "action1 failed (mu = %.2e, isp=%d)",mu,isp); + + rdmy=fabs(act1-act0); + MPI_Reduce(&rdmy,dev_act,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + rdmy=act1-act0; + MPI_Reduce(&rdmy,dev_act+1,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(dev_act,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + dsdt=dSdt(mu,0,isp,status); + + if (my_rank==0) + { + printf("Solver number %d, mu = %.2e\n",isp,mu); + + if (isp==0) + printf("Status = %d\n",status[0]); + else if (isp==1) + printf("Status = %d,%d\n",status[0],status[1]); + else + printf("Status = (%d,%d,%d),(%d,%d,%d)\n", + status[0],status[1],status[2],status[3], + status[4],status[5]); + + printf("Absolute action difference |setpf1-action1| = %.1e,", + fabs(dev_act[1])); + printf(" %.1e (local)\n",dev_act[0]); + fflush(flog); + } + + rot_ud(eps); + act0=2.0*action1(mu,0,isp,0,status)/3.0; + rot_ud(-eps); + + rot_ud(-eps); + act1=2.0*action1(mu,0,isp,0,status)/3.0; + rot_ud(eps); + + rot_ud(2.0*eps); + act0-=action1(mu,0,isp,0,status)/12.0; + rot_ud(-2.0*eps); + + rot_ud(-2.0*eps); + act1-=action1(mu,0,isp,0,status)/12.0; + rot_ud(2.0*eps); + + dact=1.2345*(act0-act1)/eps; + dev_frc=dsdt-dact; + sig_loss=-log10(fabs(1.0-act0/act1)); + + rdmy=dsdt; + MPI_Reduce(&rdmy,&dsdt,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dsdt,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=dev_frc; + MPI_Reduce(&rdmy,&dev_frc,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dev_frc,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=sig_loss; + MPI_Reduce(&rdmy,&sig_loss,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&sig_loss,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error_chk(); + + if (my_rank==0) + { + printf("Relative deviation of dS/dt = %.2e ",fabs(dev_frc/dsdt)); + printf("[significance loss = %d digits]\n\n",(int)(sig_loss)); + fflush(flog); + } + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check6.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check6.in new file mode 100644 index 0000000000000000000000000000000000000000..e7981ba77309d6d191c28b2c21d276e222934314 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check6.in @@ -0,0 +1,42 @@ + +[Solver 0] +solver CGNE +nmx 256 +res 1.0e-12 + +[Solver 1] +solver SAP_GCR +nmx 128 +nkv 16 +isolv 0 +nmr 4 +ncy 3 +res 1.0e-12 + +[Solver 2] +solver DFL_SAP_GCR +nmx 64 +nkv 16 +isolv 1 +nmr 4 +ncy 5 +res 1.0e-12 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 8 + +[Deflation subspace generation] +kappa 0.1350 +mu 0.01 +ninv 5 +nmr 4 +ncy 5 + +[Deflation projection] +nkv 16 +nmx 64 +res 1.0e-2 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check7.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check7.c new file mode 100644 index 0000000000000000000000000000000000000000..ac36c292560f6242d8f47328f828aa1307532369 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check7.c @@ -0,0 +1,449 @@ + +/******************************************************************************* +* +* File check7.c +* +* Copyright (C) 2011-2013 Stefan Schaefer, Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of force2() and action2(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dfl.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) + + +static void rot_ud(double eps) +{ + int bc,ix,t,ifc; + su3_dble *u; + su3_alg_dble *mom; + mdflds_t *mdfs; + + bc=bc_type(); + mdfs=mdflds(); + mom=(*mdfs).mom; + u=udfld(); + + for (ix=(VOLUME/2);ix]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.782); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + chi[0]=0.123; + chi[1]=-0.534; + chi_prime[0]=0.912; + chi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,0.953,1.203,chi,chi_prime); + print_bc_parms(); + + if (my_rank==0) + { + find_section("SAP"); + read_iprms("bs",4,bs); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + set_sap_parms(bs,1,4,5); + + if (my_rank==0) + { + find_section("Deflation subspace"); + read_iprms("bs",4,bs); + read_line("Ns","%d",&Ns); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_parms(bs,Ns); + + if (my_rank==0) + { + find_section("Deflation subspace generation"); + read_line("kappa","%lf",&kappa); + read_line("mu","%lf",&mu); + read_line("ninv","%d",&ninv); + read_line("nmr","%d",&nmr); + read_line("ncy","%d",&ncy); + } + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&ninv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_gen_parms(kappa,mu,ninv,nmr,ncy); + + if (my_rank==0) + { + find_section("Deflation projection"); + read_line("nkv","%d",&nkv); + read_line("nmx","%d",&nmx); + read_line("res","%lf",&res); + } + + MPI_Bcast(&nkv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + set_dfl_pro_parms(nkv,nmx,res); + + set_hmc_parms(0,NULL,1,0,NULL,1,1.0); + mnkv=0; + + for (isp=0;isp<3;isp++) + { + read_solver_parms(isp); + sp=solver_parms(isp); + + if (sp.nkv>mnkv) + mnkv=sp.nkv; + } + + if (my_rank==0) + fclose(fin); + + print_solver_parms(&isap,&idfl); + print_sap_parms(1); + print_dfl_parms(0); + + start_ranlux(0,1245); + geometry(); + + set_sw_parms(-0.0123); + mnkv=2*mnkv+2; + if (mnkv<(Ns+2)) + mnkv=Ns+2; + if (mnkv<5) + mnkv=5; + + alloc_ws(mnkv); + alloc_wsd(6); + alloc_wv(2*nkv+2); + alloc_wvd(4); + + for (isp=0;isp<3;isp++) + { + if (isp==0) + { + mu0=1.0; + mu1=1.5; + eps=1.0e-4; + } + else if (isp==1) + { + mu0=0.1; + mu1=0.25; + eps=2.0e-4; + } + else + { + mu0=0.01; + mu1=0.02; + eps=3.0e-4; + } + + random_ud(); + chs_ubnd(-1); + random_mom(); + + if (isp==2) + { + dfl_modes(status); + error_root(status[0]<0,1,"main [check7.c]", + "dfl_modes failed"); + } + + status[0]=0; + status[1]=0; + + act0=setpf2(mu0,mu1,0,isp,0,status); + error_root((status[0]<0)||(status[1]<0),1,"main [check7.c]", + "setpf2 failed (isp,mu0,mu1=%d,%.2e,%.2e)",isp,mu0,mu1); + act1=action2(mu0,mu1,0,isp,0,status); + error_root((status[0]<0)||(status[1]<0),1,"main [check7.c]", + "action2 failed (isp,mu0,mu1=%d,%.2e,%.2e)",isp,mu0,mu1); + + rdmy=fabs(act1-act0); + MPI_Reduce(&rdmy,dev_act,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + rdmy=act1-act0; + MPI_Reduce(&rdmy,dev_act+1,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(dev_act,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + dsdt=dSdt(mu0,mu1,0,isp,status); + + if (my_rank==0) + { + printf("Solver number %d, mu0 = %.2e, mu1 = %.2e\n",isp,mu0,mu1); + + if (isp==0) + printf("Status = %d\n",status[0]); + else if (isp==1) + printf("Status = %d,%d\n",status[0],status[1]); + else + printf("Status = (%d,%d,%d),(%d,%d,%d)\n", + status[0],status[1],status[2],status[3], + status[4],status[5]); + + printf("Absolute action difference |setpf2-action2| = %.1e,", + fabs(dev_act[1])); + printf(" %.1e (local)\n",dev_act[0]); + fflush(flog); + } + + rot_ud(eps); + act0=2.0*action2(mu0,mu1,0,isp,0,status)/3.0; + rot_ud(-eps); + + rot_ud(-eps); + act1=2.0*action2(mu0,mu1,0,isp,0,status)/3.0; + rot_ud(eps); + + rot_ud(2.0*eps); + act0-=action2(mu0,mu1,0,isp,0,status)/12.0; + rot_ud(-2.0*eps); + + rot_ud(-2.0*eps); + act1-=action2(mu0,mu1,0,isp,0,status)/12.0; + rot_ud(2.0*eps); + + dact=1.2345*(act0-act1)/eps; + dev_frc=dsdt-dact; + sig_loss=-log10(fabs(1.0-act0/act1)); + + rdmy=dsdt; + MPI_Reduce(&rdmy,&dsdt,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dsdt,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=dev_frc; + MPI_Reduce(&rdmy,&dev_frc,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dev_frc,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=sig_loss; + MPI_Reduce(&rdmy,&sig_loss,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&sig_loss,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error_chk(); + + if (my_rank==0) + { + printf("Relative deviation of dS/dt = %.2e ",fabs(dev_frc/dsdt)); + printf("[significance loss = %d digits]\n\n",(int)(sig_loss)); + fflush(flog); + } + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check8.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check8.c new file mode 100644 index 0000000000000000000000000000000000000000..49310a55bd4debd53e6a6c3044570f09834f4447 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check8.c @@ -0,0 +1,288 @@ + +/******************************************************************************* +* +* File check8.c +* +* Copyright (C) 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check and performance of the multi-shift CG solver. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "archive.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "forces.h" +#include "global.h" + +static int my_rank,bc,first,last,step; +static int nmu,nmx; +static double kappa,csw,*mu,cF,cF_prime; +static double uphi[2],uphi_prime[2],m0,*res; +static char cnfg_dir[NAME_SIZE],cnfg_file[NAME_SIZE],nbase[NAME_SIZE]; + + +int main(int argc,char *argv[]) +{ + int nsize,icnfg,status,k,ie; + double nrm,del; + double wt1,wt2,wdt; + spinor_dble *eta,*chi,*phi,**psi,**wsd,**rsd; + lat_parms_t lat; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check8.log","w",stdout); + fin=freopen("check8.in","r",stdin); + + printf("\n"); + printf("Check and performance of the multi-shift CG solver\n"); + printf("--------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + find_section("Configurations"); + read_line("name","%s",nbase); + read_line("cnfg_dir","%s",cnfg_dir); + read_line("first","%d",&first); + read_line("last","%d",&last); + read_line("step","%d",&step); + + find_section("Lattice parameters"); + read_line("kappa","%lf",&kappa); + read_line("csw","%lf",&csw); + nmu=count_tokens("mu"); + + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + uphi[0]=0.0; + uphi[1]=0.0; + uphi_prime[0]=0.0; + uphi_prime[1]=0.0; + cF=1.0; + cF_prime=1.0; + + if (bc==1) + read_dprms("uphi",2,uphi); + + if ((bc==1)||(bc==2)) + read_dprms("uphi'",2,uphi_prime); + + if (bc!=3) + read_line("cF","%lf",&cF); + + if (bc==2) + read_line("cF'","%lf",&cF_prime); + else + cF_prime=cF; + } + + MPI_Bcast(nbase,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(cnfg_dir,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(&first,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&last,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&step,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&csw,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&nmu,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(uphi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(uphi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + mu=malloc(2*nmu*sizeof(*mu)); + error(mu==NULL,1,"main [check8.c]","Unable to allocate auxiliary arrays"); + res=mu+nmu; + + if (my_rank==0) + { + find_section("Lattice parameters"); + read_dprms("mu",nmu,mu); + + find_section("CG"); + read_line("nmx","%d",&nmx); + error_root(nmu!=count_tokens("res"),1,"main [check8.c]", + "The numbers of twisted masses and residues do not match"); + read_dprms("res",nmu,res); + + fclose(fin); + } + + MPI_Bcast(mu,nmu,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(res,nmu,MPI_DOUBLE,0,MPI_COMM_WORLD); + + lat=set_lat_parms(5.5,1.0,1,&kappa,csw); + print_lat_parms(); + + set_bc_parms(bc,1.0,1.0,cF,cF_prime,uphi,uphi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + + m0=lat.m0[0]; + set_sw_parms(m0); + + if (my_rank==0) + { + printf("mu = %.6f",mu[0]); + for (k=1;k %sn%d in steps of %d\n\n", + nbase,first,nbase,last,step); + fflush(flog); + } + + if (nmu==1) + alloc_wsd(8); + else + alloc_wsd(5+2*nmu); + + wsd=reserve_wsd(2); + eta=wsd[0]; + chi=wsd[1]; + psi=reserve_wsd(nmu); + + error_root(((last-first)%step)!=0,1,"main [check8.c]", + "last-first is not a multiple of step"); + check_dir_root(cnfg_dir); + nsize=name_size("%s/%sn%d",cnfg_dir,nbase,last); + error_root(nsize>=NAME_SIZE,1,"main [check8.c]", + "configuration file name is too long"); + ie=0; + + for (icnfg=first;icnfg<=last;icnfg+=step) + { + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + import_cnfg(cnfg_file); + + if (my_rank==0) + { + printf("Configuration no %d\n\n",icnfg); + fflush(flog); + } + + chs_ubnd(-1); + random_sd(VOLUME,eta,1.0); + bnd_sd2zero(ALL_PTS,eta); + nrm=sqrt(norm_square_dble(VOLUME/2,1,eta)); + assign_sd2sd(VOLUME,eta,chi); + + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + tmcgm(nmx,res,nmu,mu,eta,psi,&status); + + MPI_Barrier(MPI_COMM_WORLD); + wt2=MPI_Wtime(); + wdt=wt2-wt1; + + error_chk(); + mulr_spinor_add_dble(VOLUME,chi,eta,-1.0); + del=norm_square_dble(VOLUME,1,chi); + error_root(del!=0.0,1,"main [check8.c]", + "Source field is not preserved"); + + if (my_rank==0) + { + printf("status = %d\n",status); + printf("time = %.2e sec (total)\n",wdt); + if (status>0) + printf(" = %.2e usec (per point and CG iteration)\n", + (1.0e6*wdt)/((double)(status)*(double)(VOLUME))); + fflush(flog); + error_root(status<0,1,"main [check8.c]", + "Solver did not converge"); + printf("residues = "); + } + + rsd=reserve_wsd(1); + phi=rsd[0]; + status=0; + + for (k=0;k +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dfl.h" +#include "forces.h" +#include "global.h" + + +#define N0 (NPROC0*L0) + + +static void rot_ud(double eps) +{ + int bc,ix,t,ifc; + su3_dble *u; + su3_alg_dble *mom; + mdflds_t *mdfs; + + bc=bc_type(); + mdfs=mdflds(); + mom=(*mdfs).mom; + u=udfld(); + + for (ix=(VOLUME/2);ix]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.782); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + chi[0]=0.123; + chi[1]=-0.534; + chi_prime[0]=0.912; + chi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,0.953,1.203,chi,chi_prime); + print_bc_parms(); + + read_rat_parms(0); + + if (my_rank==0) + { + find_section("SAP"); + read_iprms("bs",4,bs); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + set_sap_parms(bs,1,4,5); + + if (my_rank==0) + { + find_section("Deflation subspace"); + read_iprms("bs",4,bs); + read_line("Ns","%d",&Ns); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_parms(bs,Ns); + + if (my_rank==0) + { + find_section("Deflation subspace generation"); + read_line("kappa","%lf",&kappa); + read_line("mu","%lf",&mu); + read_line("ninv","%d",&ninv); + read_line("nmr","%d",&nmr); + read_line("ncy","%d",&ncy); + } + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&ninv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_gen_parms(kappa,mu,ninv,nmr,ncy); + + if (my_rank==0) + { + find_section("Deflation projection"); + read_line("nkv","%d",&nkv); + read_line("nmx","%d",&nmx); + read_line("res","%lf",&res); + } + + MPI_Bcast(&nkv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + set_dfl_pro_parms(nkv,nmx,res); + + set_hmc_parms(0,NULL,1,0,NULL,1,1.0); + mnkv=0; + + for (isp=0;isp<3;isp++) + { + read_solver_parms(isp); + sp=solver_parms(isp); + + if (sp.nkv>mnkv) + mnkv=sp.nkv; + } + + if (my_rank==0) + fclose(fin); + + print_rat_parms(); + print_solver_parms(&isap,&idfl); + print_sap_parms(1); + print_dfl_parms(0); + + start_ranlux(0,1245); + geometry(); + + set_sw_parms(-0.0123); + rp=rat_parms(0); + irat[0]=0; + + mnkv=2*mnkv+2; + if (mnkv<(Ns+2)) + mnkv=Ns+2; + if (mnkv<5) + mnkv=5; + + alloc_ws(mnkv); + + if (2*rp.degree>4) + alloc_wsd(2*rp.degree+3); + else + alloc_wsd(7); + + alloc_wv(2*nkv+2); + alloc_wvd(4); + + for (isw=0;isw<2;isw++) + { + for (isp=0;isp<3;isp++) + { + if (isp==0) + { + irat[1]=0; + irat[2]=rp.degree/3; + eps=1.0e-4; + } + else if (isp==1) + { + irat[1]=rp.degree/3+1; + irat[2]=(2*rp.degree)/3; + eps=2.0e-4; + } + else + { + irat[1]=(2*rp.degree)/3+1; + irat[2]=rp.degree-1; + eps=3.0e-4; + } + + random_ud(); + chs_ubnd(-1); + random_mom(); + + if (isp==2) + { + dfl_modes(status); + error_root(status[0]<0,1,"main [check9.c]", + "dfl_modes failed"); + } + + status[0]=0; + status[1]=0; + + act0=setpf3(irat,0,isw,isp,0,status); + error_root((status[0]<0)||(status[1]<0),1, + "main [check9.c]","setpf3 failed " + "(irat=(%d,%d,%d), isp=%d)",irat[0],irat[1],irat[2],isp); + + act1=action3(irat,0,isw,isp,0,status); + error_root((status[0]<0)||(status[1]<0),1, + "main [check9.c]","action3 failed " + "(irat=(%d,%d,%d), isp=%d)",irat[0],irat[1],irat[2],isp); + + rdmy=act1-act0; + MPI_Reduce(&rdmy,&dev_act,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dev_act,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rot_ud(eps); + dsdt=dSdt(irat,0,isw,isp,status); + + if (my_rank==0) + { + printf("Solver number %d, poles %d,..,%d, isw %d\n", + isp,irat[1],irat[2],isw); + + if (isp==0) + printf("Status = %d\n",status[0]); + else if (isp==1) + printf("Status = %d,%d\n",status[0],status[1]); + else + printf("Status = (%d,%d,%d),(%d,%d,%d)\n", + status[0],status[1],status[2],status[3], + status[4],status[5]); + + printf("Absolute action difference |setpf3-action3| = %.1e\n", + fabs(dev_act)); + fflush(flog); + } + + rot_ud(eps); + act0=2.0*action3(irat,0,isw,isp,0,status)/3.0; + rot_ud(-eps); + + rot_ud(-eps); + act1=2.0*action3(irat,0,isw,isp,0,status)/3.0; + rot_ud(eps); + + rot_ud(2.0*eps); + act0-=action3(irat,0,isw,isp,0,status)/12.0; + rot_ud(-2.0*eps); + + rot_ud(-2.0*eps); + act1-=action3(irat,0,isw,isp,0,status)/12.0; + rot_ud(2.0*eps); + + dact=1.2345*(act0-act1)/eps; + dev_frc=dsdt-dact; + sig_loss=-log10(fabs(1.0-act0/act1)); + + rdmy=dsdt; + MPI_Reduce(&rdmy,&dsdt,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dsdt,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=dev_frc; + MPI_Reduce(&rdmy,&dev_frc,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dev_frc,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=sig_loss; + MPI_Reduce(&rdmy,&sig_loss,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&sig_loss,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error_chk(); + + if (my_rank==0) + { + printf("Relative deviation of dS/dt = %.2e ",fabs(dev_frc/dsdt)); + printf("[significance loss = %d digits]\n\n",(int)(sig_loss)); + fflush(flog); + } + } + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check9.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check9.in new file mode 100644 index 0000000000000000000000000000000000000000..448bc613a73b1e96f8d3fe5975d910dd75af462e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check9.in @@ -0,0 +1,47 @@ + +[Rational 0] +degree 12 +range 0.001 7.9 + +[Solver 0] +solver MSCG +nmx 256 +res 1.0e-12 + +[Solver 1] +solver SAP_GCR +nmx 128 +nkv 16 +isolv 0 +nmr 4 +ncy 3 +res 1.0e-12 + +[Solver 2] +solver DFL_SAP_GCR +nmx 64 +nkv 16 +isolv 1 +nmr 4 +ncy 5 +res 1.0e-12 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 8 + +[Deflation subspace generation] +kappa 0.1350 +mu 0.01 +ninv 5 +nmr 4 +ncy 5 + +[Deflation projection] +nkv 16 +nmx 64 +res 1.0e-2 + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/time1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/time1.c new file mode 100644 index 0000000000000000000000000000000000000000..7d44aed8057e3b91864783c947ef2daee14295eb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/time1.c @@ -0,0 +1,229 @@ + +/******************************************************************************* +* +* File time1.c +* +* Copyright (C) 2005, 2008-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Timing of plaq_frc(), sw_frc() and hop_frc(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "mdflds.h" +#include "forces.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,n,count; + double phi[2],phi_prime[2]; + double wt1,wt2,wdt; + FILE *flog=NULL; + spinor_dble **wsd; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time1.log","w",stdout); + + printf("\n"); + printf("Timing of plaq_frc(), sw_frc() and hop_frc()\n"); + printf("--------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [time1.c]", + "Syntax: time1 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + + set_sw_parms(-0.1235); + alloc_wsd(2); + wsd=reserve_wsd(2); + + random_ud(); + chs_ubnd(-1); + random_sd(VOLUME,wsd[0],1.0); + random_sd(VOLUME,wsd[1],1.0); + bnd_sd2zero(ALL_PTS,wsd[0]); + bnd_sd2zero(ALL_PTS,wsd[1]); + + plaq_frc(); + set_frc2zero(); + set_xt2zero(); + add_prod2xt(-0.5,wsd[0],wsd[1]); + add_prod2xv(-0.5,wsd[0],wsd[1]); + sw_frc(1.0); + hop_frc(1.0); + + n=(int)(3.0e6/(double)(4*VOLUME)); + if (n<2) + n=2; + wdt=0.0; + + while (wdt<5.0) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + for (count=0;count that allows the +type of boundary condition to be chosen at runtime. When the option is not +set, open boundary conditions are assumed. + +The option may be set but has no effect in the case of check1. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..9476981cd86ad9ab1abc85fb4334dff22acc0fb8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/Makefile @@ -0,0 +1,128 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 + +FLAGS = flags lat_parms dfl_parms + +LATTICE = bcnds uidx geometry + +RANDOM = ranlux ranlxs ranlxd gauss random_su3 + +UFLDS = plaq_sum shift uflds udcom bstap + +SFLDS = sflds + +SU3FCTS = su3prod su3ren cm3x3 + +UTILS = endian mutils utils wspace + +MODULES = $(FLAGS) $(LATTICE) $(RANDOM) $(UFLDS) $(SFLDS) $(SU3FCTS) \ + $(UTILS) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/random:$(MDIR)/uflds:\ + $(MDIR)/sflds:$(MDIR)/su3fcts:$(MDIR)/utils + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..b4723d6850f8fe3d4cf0b8934538a9309e18b651 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/check1.c @@ -0,0 +1,327 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2005, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Consistency checks on the global index arrays cpr,...,map +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "lattice.h" +#include "global.h" + +#define NPROC_BLK (NPROC0_BLK*NPROC1_BLK*NPROC2_BLK*NPROC3_BLK) + +static int ip_test[NPROC]; +static int ix_test[VOLUME]; +static int ia[2][9]; + +static void set_ia(void) +{ + int ifc; + + ia[0][0]=0; + ia[0][1]=ia[0][0]+(FACE0/2); + ia[0][2]=ia[0][1]+(FACE0/2); + ia[0][3]=ia[0][2]+(FACE1/2); + ia[0][4]=ia[0][3]+(FACE1/2); + ia[0][5]=ia[0][4]+(FACE2/2); + ia[0][6]=ia[0][5]+(FACE2/2); + ia[0][7]=ia[0][6]+(FACE3/2); + ia[0][8]=ia[0][7]+(FACE3/2); + + for (ifc=0;ifc<9;ifc++) + ia[1][ifc]=ia[0][ifc]+(BNDRY/2); +} + + +int main(int argc,char *argv[]) +{ + int my_rank,itest; + int in,ir,n[4]; + int mu,ix,x0,x1,x2,x3; + int iy0,iy1,iy2,iy3,iz0,iz1,iz2,iz3; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + + printf("\n"); + printf("Consistency checks on the global index arrays cpr,...,map\n"); + printf("---------------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d local lattice\n",L0,L1,L2,L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d grid blocks\n\n", + NPROC0_BLK,NPROC1_BLK,NPROC2_BLK,NPROC3_BLK); + } + + geometry(); + set_ia(); + + error(my_rank!=ipr_global(cpr),1, + "main [check1.c]","Processor coordinates are incorrect"); + + if (my_rank==0) + { + for (in=0;in=(ir+NPROC_BLK))) + itest=2; + } + + error(itest==1,1, + "main [check1.c]","ipr_global is process dependent"); + + error(itest==2,1, + "main [check1.c]","Processes are not properly blocked"); + + n[0]=cpr[0]; + n[1]=cpr[1]; + n[2]=cpr[2]; + n[3]=cpr[3]; + + for (mu=0;mu<4;mu++) + { + n[mu]-=1; + if (npr[2*mu]!=ipr_global(n)) + itest=1; + n[mu]+=2; + if (npr[2*mu+1]!=ipr_global(n)) + itest=1; + n[mu]-=1; + } + + error(itest==1,1, + "main [check1.c]","npr is incorrect"); + + for (ix=0;ix=VOLUME)) + itest=1; + else + ix_test[ix]+=1; + } + } + } + } + + error(itest==1,1, + "main [check1.c]","The index ipt is out of range"); + + for (ix=0;ix=(VOLUME/2)))||((ir==1)&&(ix<(VOLUME/2)))) + itest=1; + + ir=(ir+1)%2; + iy0=iup[ix][0]; + iz0=ipt[x3+L3*x2+L2*L3*x1+L1*L2*L3*((x0+1)%L0)]; + + if ((x0==(L0-1))&&(NPROC0>1)) + { + iy0-=VOLUME; + if ((iy0=ia[ir][2])) + itest=2; + else + iy0=map[iy0]; + } + + iy1=iup[ix][1]; + iz1=ipt[x3+L3*x2+L2*L3*((x1+1)%L1)+L1*L2*L3*x0]; + + if ((x1==(L1-1))&&(NPROC1>1)) + { + iy1-=VOLUME; + if ((iy1=ia[ir][4])) + itest=2; + else + iy1=map[iy1]; + } + + iy2=iup[ix][2]; + iz2=ipt[x3+L3*((x2+1)%L2)+L2*L3*x1+L1*L2*L3*x0]; + + if ((x2==(L2-1))&&(NPROC2>1)) + { + iy2-=VOLUME; + if ((iy2=ia[ir][6])) + itest=2; + else + iy2=map[iy2]; + } + + iy3=iup[ix][3]; + iz3=ipt[((x3+1)%L3)+L3*x2+L2*L3*x1+L1*L2*L3*x0]; + + if ((x3==(L3-1))&&(NPROC3>1)) + { + iy3-=VOLUME; + if ((iy3=ia[ir][8])) + itest=2; + else + iy3=map[iy3]; + } + + if ((iy0!=iz0)||(iy1!=iz1)||(iy2!=iz2)||(iy3!=iz3)) + itest=3; + + iy0=idn[ix][0]; + iz0=ipt[x3+L3*x2+L2*L3*x1+L1*L2*L3*((x0+L0-1)%L0)]; + + if ((x0==0)&&(NPROC0>1)) + { + iy0-=VOLUME; + if ((iy0=ia[ir][1])) + itest=4; + else + iy0=map[iy0]; + } + + iy1=idn[ix][1]; + iz1=ipt[x3+L3*x2+L2*L3*((x1+L1-1)%L1)+L1*L2*L3*x0]; + + if ((x1==0)&&(NPROC1>1)) + { + iy1-=VOLUME; + if ((iy1=ia[ir][3])) + itest=4; + else + iy1=map[iy1]; + } + + iy2=idn[ix][2]; + iz2=ipt[x3+L3*((x2+L2-1)%L2)+L2*L3*x1+L1*L2*L3*x0]; + + if ((x2==0)&&(NPROC2>1)) + { + iy2-=VOLUME; + if ((iy2=ia[ir][5])) + itest=4; + else + iy2=map[iy2]; + } + + iy3=idn[ix][3]; + iz3=ipt[((x3+L3-1)%L3)+L3*x2+L2*L3*x1+L1*L2*L3*x0]; + + if ((x3==0)&&(NPROC3>1)) + { + iy3-=VOLUME; + if ((iy3=ia[ir][7])) + itest=4; + else + iy3=map[iy3]; + } + + if ((iy0!=iz0)||(iy1!=iz1)||(iy2!=iz2)||(iy3!=iz3)) + itest=5; + } + } + } + } + + error(itest==1,1, + "main [check1.c]","The index ipt does not respect eo ordering"); + error(itest==2,1, + "main [check1.c]","The index iup is out of range at the boundaries"); + error(itest==3,1, + "main [check1.c]","The index iup (combined with map) is incorrect"); + error(itest==4,1, + "main [check1.c]","The index idn is out of range at the boundaries"); + error(itest==5,1, + "main [check1.c]","The index idn (combined with map) is incorrect"); + + if (my_rank==0) + { + printf("The lattice is correctly mapped by the global arrays\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..6e1c93dfd61ef68e0b91d5505cdeb1c36c2b5a1f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/check2.c @@ -0,0 +1,594 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2010, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the programs set_bc(), check_bc() and chs_ubnd(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "uflds.h" +#include "lattice.h" +#include "global.h" + +#define N0 (NPROC0*L0) + + +static void new_fld(int ibnd) +{ + su3_dble *ud,*udm; + + ud=udfld(); + udm=ud+4*VOLUME; + + for (;ud0)&&(t<(N0-1)))|| + ((t==0)&&((ifc==0)||((ifc==1)&&(bc!=0))||((ifc>=2)&&(bc!=1))))|| + ((t==(N0-1))&&(bc!=0))) + ie|=cmp_ud(u,v); + + u+=1; + v+=1; + } + } + + return ie; +} + + +static int check_diag(su3_dble *u) +{ + int i,ie; + double r[18]; + complex_dble z; + + ie=0; + + r[ 0]=(*u).c11.re; + r[ 1]=(*u).c11.im; + r[ 2]=(*u).c12.re; + r[ 3]=(*u).c12.im; + r[ 4]=(*u).c13.re; + r[ 5]=(*u).c13.im; + + r[ 6]=(*u).c21.re; + r[ 7]=(*u).c21.im; + r[ 8]=(*u).c22.re; + r[ 9]=(*u).c22.im; + r[10]=(*u).c23.re; + r[11]=(*u).c23.im; + + r[12]=(*u).c31.re; + r[13]=(*u).c31.im; + r[14]=(*u).c32.re; + r[15]=(*u).c32.im; + r[16]=(*u).c33.re; + r[17]=(*u).c33.im; + + ie|=(fabs(r[ 0]*r[ 0]+r[ 1]*r[ 1]-1.0)>(8.0*DBL_EPSILON)); + ie|=(fabs(r[ 8]*r[ 8]+r[ 9]*r[ 9]-1.0)>(8.0*DBL_EPSILON)); + ie|=(fabs(r[16]*r[16]+r[17]*r[17]-1.0)>(8.0*DBL_EPSILON)); + + z.re=r[0]*r[8]-r[1]*r[9]; + z.im=r[0]*r[9]+r[1]*r[8]; + ie|=(fabs(z.re*r[16]-z.im*r[17]-1.0)>(16.0*DBL_EPSILON)); + ie|=(fabs(z.re*r[17]+z.im*r[16])>(16.0*DBL_EPSILON)); + + for (i=0;i<18;i++) + { + if (((i>1)&&(i<8))||((i>9)&&(i<16))) + ie|=(r[i]!=0.0); + } + + return ie; +} + + +static int check_bval(su3_dble *u) +{ + int bc,ie,ifc; + int ipt,npts,*pts; + + ie=0; + bc=bc_type(); + + if (bc==1) + { + pts=bnd_pts(&npts); + + if (npts>0) + { + pts+=(npts/2); + + ie|=check_diag(u+8*(pts[0]-(VOLUME/2))+2); + ie|=check_diag(u+8*(pts[0]-(VOLUME/2))+4); + ie|=check_diag(u+8*(pts[0]-(VOLUME/2))+6); + + for (ipt=0;ipt<(npts/2);ipt++) + { + for (ifc=2;ifc<8;ifc++) + ie|=cmp_ud(u+8*(pts[0]-(VOLUME/2))+2*(ifc/2), + u+8*(pts[ipt]-(VOLUME/2))+ifc); + } + } + } + + if (((bc==1)||(bc==2))&&(cpr[0]==(NPROC0-1))) + { + u+=4*VOLUME+7*(BNDRY/4); + + ie|=check_diag(u); + ie|=check_diag(u+1); + ie|=check_diag(u+2); + } + + return ie; +} + + +static complex_dble detu(su3_dble *u) +{ + complex_dble z,w; + + z.re= + (*u).c22.re*(*u).c33.re-(*u).c22.im*(*u).c33.im- + (*u).c32.re*(*u).c23.re+(*u).c32.im*(*u).c23.im; + + z.im= + (*u).c22.re*(*u).c33.im+(*u).c22.im*(*u).c33.re- + (*u).c32.re*(*u).c23.im-(*u).c32.im*(*u).c23.re; + + w.re=(*u).c11.re*z.re-(*u).c11.im*z.im; + w.im=(*u).c11.re*z.im+(*u).c11.im*z.re; + + z.re= + (*u).c32.re*(*u).c13.re-(*u).c32.im*(*u).c13.im- + (*u).c12.re*(*u).c33.re+(*u).c12.im*(*u).c33.im; + + z.im= + (*u).c32.re*(*u).c13.im+(*u).c32.im*(*u).c13.re- + (*u).c12.re*(*u).c33.im-(*u).c12.im*(*u).c33.re; + + w.re+=((*u).c21.re*z.re-(*u).c21.im*z.im); + w.im+=((*u).c21.re*z.im+(*u).c21.im*z.re); + + z.re= + (*u).c12.re*(*u).c23.re-(*u).c12.im*(*u).c23.im- + (*u).c22.re*(*u).c13.re+(*u).c22.im*(*u).c13.im; + + z.im= + (*u).c12.re*(*u).c23.im+(*u).c12.im*(*u).c23.re- + (*u).c22.re*(*u).c13.im-(*u).c22.im*(*u).c13.re; + + w.re+=((*u).c31.re*z.re-(*u).c31.im*z.im); + w.im+=((*u).c31.re*z.im+(*u).c31.im*z.re); + + return w; +} + + +static double check_detu(int ibc,su3_dble *u) +{ + int bc,ix,t,ifc; + double d,dmax; + complex_dble z; + + bc=bc_type(); + dmax=0.0; + + for (ix=(VOLUME/2);ixdmax) + dmax=d; + u+=1; + + z=detu(u); + + if ((bc==3)&&(ibc==-1)) + d=fabs(z.re+1.0)+fabs(z.im); + else if (bc==0) + d=fabs(z.re)+fabs(z.im); + else + d=fabs(z.re-1.0)+fabs(z.im); + + if (d>dmax) + dmax=d; + u+=1; + + for (ifc=2;ifc<8;ifc++) + { + z=detu(u); + d=fabs(z.re-1.0)+fabs(z.im); + if (d>dmax) + dmax=d; + u+=1; + } + } + else if (t==(N0-1)) + { + z=detu(u); + + if ((bc==3)&&(ibc==-1)) + d=fabs(z.re+1.0)+fabs(z.im); + else if (bc==0) + d=fabs(z.re)+fabs(z.im); + else + d=fabs(z.re-1.0)+fabs(z.im); + + if (d>dmax) + dmax=d; + u+=1; + + for (ifc=1;ifc<8;ifc++) + { + z=detu(u); + d=fabs(z.re-1.0)+fabs(z.im); + if (d>dmax) + dmax=d; + u+=1; + } + } + else + { + for (ifc=0;ifc<8;ifc++) + { + z=detu(u); + d=fabs(z.re-1.0)+fabs(z.im); + if (d>dmax) + dmax=d; + u+=1; + } + } + } + + if (NPROC>1) + { + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + return dmax; +} + + +static int scmp_ud(su3_dble *u,su3_dble *v) +{ + int i; + double r[18]; + + r[ 0]=(*u).c11.re+(*v).c11.re; + r[ 1]=(*u).c11.im+(*v).c11.im; + r[ 2]=(*u).c12.re+(*v).c12.re; + r[ 3]=(*u).c12.im+(*v).c12.im; + r[ 4]=(*u).c13.re+(*v).c13.re; + r[ 5]=(*u).c13.im+(*v).c13.im; + + r[ 6]=(*u).c21.re+(*v).c21.re; + r[ 7]=(*u).c21.im+(*v).c21.im; + r[ 8]=(*u).c22.re+(*v).c22.re; + r[ 9]=(*u).c22.im+(*v).c22.im; + r[10]=(*u).c23.re+(*v).c23.re; + r[11]=(*u).c23.im+(*v).c23.im; + + r[12]=(*u).c31.re+(*v).c31.re; + r[13]=(*u).c31.im+(*v).c31.im; + r[14]=(*u).c32.re+(*v).c32.re; + r[15]=(*u).c32.im+(*v).c32.im; + r[16]=(*u).c33.re+(*v).c33.re; + r[17]=(*u).c33.im+(*v).c33.im; + + for (i=0;i<18;i++) + { + if (r[i]!=0.0) + return 1; + } + + return 0; +} + + +static int cmp_all(int ibc,su3_dble *u,su3_dble *v) +{ + int ix,t,ifc,ie,bc; + + bc=bc_type(); + ie=0; + + for (ix=(VOLUME/2);ix<(VOLUME);ix++) + { + t=global_time(ix); + + if (t==0) + { + ie|=cmp_ud(u,v); + u+=1; + v+=1; + + if ((bc==3)&&(ibc==-1)) + ie|=scmp_ud(u,v); + else + ie|=cmp_ud(u,v); + + u+=1; + v+=1; + + for (ifc=2;ifc<8;ifc++) + { + ie|=cmp_ud(u,v); + u+=1; + v+=1; + } + + } + else if (t==(N0-1)) + { + if ((bc==3)&&(ibc==-1)) + ie|=scmp_ud(u,v); + else + ie|=cmp_ud(u,v); + + u+=1; + v+=1; + + for (ifc=1;ifc<8;ifc++) + { + ie|=cmp_ud(u,v); + u+=1; + v+=1; + } + } + else + { + for (ifc=0;ifc<8;ifc++) + { + ie|=cmp_ud(u,v); + u+=1; + v+=1; + } + } + } + + return ie; +} + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,ie; + double phi[2],phi_prime[2]; + double cG,cG_prime,cF,cF_prime; + double dev0,dev1; + su3_dble *udb,**usv; + bc_parms_t bcp; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check2.log","w",stdout); + printf("\n"); + printf("Check of set_bc() and check_bc()\n"); + printf("--------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check2.c]", + "Syntax: check2 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + cG=0.97; + cG_prime=1.056; + cF=0.82; + cF_prime=1.12; + set_bc_parms(bc,cG,cG_prime,cF,cF_prime,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_wud(1); + usv=reserve_wud(1); + udb=udfld(); + + ie=0; + bcp=bc_parms(); + error(bcp.type!=bc,1,"main [check2.c]", + "Type of boundary condition is not properly set"); + + if (bc!=3) + { + ie|=(cG!=bcp.cG[0]); + ie|=(cF!=bcp.cF[0]); + } + + if (bc<=1) + { + ie|=(bcp.cG[0]!=bcp.cG[1]); + ie|=(bcp.cF[0]!=bcp.cF[1]); + } + + if (bc==2) + { + ie|=(cG_prime!=bcp.cG[1]); + ie|=(cF_prime!=bcp.cF[1]); + } + + if (bc==1) + { + ie|=(phi[0]!=bcp.phi[0][0]); + ie|=(phi[1]!=bcp.phi[0][1]); + ie|=(bcp.phi[0][2]!=-bcp.phi[0][0]-bcp.phi[0][1]); + } + + if ((bc==1)||(bc==2)) + { + ie|=(phi_prime[0]!=bcp.phi[1][0]); + ie|=(phi_prime[1]!=bcp.phi[1][1]); + ie|=(bcp.phi[1][2]!=-bcp.phi[1][0]-bcp.phi[1][1]); + } + + error(ie,1,"main [check2.c]","Boundary parameters are not properly set"); + + ie=check_bc(0.0); + error(ie!=1,1,"main [check2.c]", + "check_bc() gives the wrong answer"); + + new_fld(0); + ie=check_bc(0.0); + error(((bc<2)&&(ie!=0))||((bc>=2)&&(ie!=1)),2,"main [check2.c]", + "check_bc() gives the wrong answer"); + + new_fld(1); + ie=check_bc(0.0); + error(((bc<3)&&(ie!=0))||((bc==3)&&(ie!=1)),2,"main [check2.c]", + "check_bc() gives the wrong answer"); + + cm3x3_assign(4*VOLUME,udb,usv[0]); + set_bc(); + ie=check_bc(0.0); + error(ie!=1,2,"main [check2.c]", + "check_bc() gives the wrong answer"); + + ie=cmp_active(udb,usv[0]); + error(ie!=0,2,"main [check2.c]", + "Active link variables are modified by set_bc()"); + + ie=check_bval(udb); + error(ie!=0,2,"main [check2.c]", + "Boundary values are not properly set by set_bc()"); + + random_ud(); + cm3x3_assign(4*VOLUME,udb,usv[0]); + dev0=check_detu(1,udb); + ie=chs_ubnd(-1); + error(((bc==3)&&(ie==0))||((bc!=3)&&(ie==1)),1,"main [check2.c]", + "Incorrect return value of chs_ubnd()"); + dev1=check_detu(-1,udb); + + if (my_rank==0) + { + printf("Maximal deviation |1-det{U}|=%.1e (random field)\n",dev0); + printf(" =%.1e (after chs_ubnd)\n\n",dev1); + } + + ie=cmp_all(-1,udb,usv[0]); + error(ie!=0,3,"main [check2.c]","Incorrect action of chs_ubnd()"); + ie=chs_ubnd(1); + error(((bc==3)&&(ie==0))||((bc!=3)&&(ie==1)),2,"main [check2.c]", + "Incorrect return value of chs_ubnd()"); + ie=cmp_all(1,udb,usv[0]); + error(ie!=0,4,"main [check2.c]","Incorrect action of chs_ubnd()"); + ie=chs_ubnd(1); + error(ie!=0,3,"main [check2.c]", + "Incorrect return value of chs_ubnd()"); + + if (my_rank==0) + { + printf("No errors detected --- all programs work correctly\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..00a2345ac0fc056c1f019b2298aa015c9d975e98 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/check3.c @@ -0,0 +1,218 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the programs bnd_s2zero() and bnd_sd2zero(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "sflds.h" +#include "lattice.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define NFLDS 3 + +typedef union +{ + spinor s; + float r[24]; +} spin_t; + +typedef union +{ + spinor_dble s; + double r[24]; +} spin_dble_t; + + +static int is_zero(spinor *s) +{ + int i,ie; + spin_t *sp; + + sp=(spin_t*)(s); + ie=1; + + for (i=0;i<24;i++) + ie&=((*sp).r[i]==0.0f); + + return ie; +} + + +static int is_zero_dble(spinor_dble *s) +{ + int i,ie; + spin_dble_t *sp; + + sp=(spin_dble_t*)(s); + ie=1; + + for (i=0;i<24;i++) + ie&=((*sp).r[i]==0.0); + + return ie; +} + + +static int check_sbnd(ptset_t set,spinor *s) +{ + int bc,ix,t; + int io,ie; + + bc=bc_type(); + ie=1; + + for (ix=0;ix=(VOLUME/2)))); + + if ((io!=0)&&(((t==0)&&(bc!=3))||((t==(N0-1))&&(bc==0)))) + ie&=is_zero(s); + else + ie&=(is_zero(s)^0x1); + + s+=1; + } + + return ie; +} + + +static int check_sbnd_dble(ptset_t set,spinor_dble *s) +{ + int bc,ix,t; + int io,ie; + + bc=bc_type(); + ie=1; + + for (ix=0;ix=(VOLUME/2)))); + + if ((io!=0)&&(((t==0)&&(bc!=3))||((t==(N0-1))&&(bc==0)))) + ie&=is_zero_dble(s); + else + ie&=(is_zero_dble(s)^0x1); + + s+=1; + } + + return ie; +} + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,ie,is,k; + double phi[2],phi_prime[2]; + double cG,cG_prime,cF,cF_prime; + spinor **ps; + spinor_dble **psd; + ptset_t set; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check3.log","w",stdout); + printf("\n"); + printf("Check of the programs bnd_s2zero() and bnd_sd2zero()\n"); + printf("----------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check3.c]", + "Syntax: check3 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + cG=0.97; + cG_prime=1.056; + cF=0.82; + cF_prime=1.12; + set_bc_parms(bc,cG,cG_prime,cF,cF_prime,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_ws(NFLDS); + alloc_wsd(NFLDS); + + ps=reserve_ws(NFLDS); + psd=reserve_wsd(NFLDS); + ie=1; + + for (is=0;is<4;is++) + { + if (is==0) + set=EVEN_PTS; + else if (is==1) + set=ODD_PTS; + else if (is==2) + set=ALL_PTS; + else + set=NO_PTS; + + for (k=0;k +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "linalg.h" +#include "global.h" + +#define NMOM 100033 + +static double var[64],var_all[64]; + + +int main(int argc,char *argv[]) +{ + int my_rank,n,i,j; + double dev,dmax,dmax_all; + double nsq1,nsq2,sprod1,sprod2; + double sm,r[8]; + double rn,cij,eij; + su3_dble *M,*m,w; + su3_alg_dble *X,*Y,*x; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + + printf("\n"); + printf("Checks of the programs in the module liealg\n"); + printf("-------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + printf("Number of momenta: %d\n\n",NMOM); + } + + start_ranlux(0,123456); + geometry(); + + X=amalloc(2*NMOM*sizeof(*X),4); + M=amalloc(NMOM*sizeof(*M),4); + error((X==NULL)||(M==NULL),1, + "main [check1.c]","Unable to allocate field arrays"); + Y=X+NMOM; + + set_alg2zero(NMOM,X); + dmax=0.0; + + for (n=0;ndmax) + dmax=dev; + } + + X[n].c3=1.0; + } + + MPI_Reduce(&dmax,&dmax_all,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("Check of set_alg2zero():\n\n"); + printf("max|X| = %.1e (should be 0.0)\n\n",dmax_all); + } + + dmax=fabs(norm_square_alg(NMOM,1,X)-4.0*(double)(NMOM*NPROC)); + MPI_Reduce(&dmax,&dmax_all,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("Check of norm_square_alg():\n\n"); + printf("Element count = %.1e (should be 0.0)\n\n",dmax_all); + } + + sm=0.0; + dmax=0.0; + + for (n=0;ndmax) + dmax=dev; + + sm+=nsq2; + } + + MPI_Reduce(&dmax,&dmax_all,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("|1.0+2*tr{X^2}/||X||^2| = %.1e (single elements)\n",dmax_all); + printf("(should be less than %.1e or so)\n\n",DBL_EPSILON*sqrt(8.0)); + } + + dmax=fabs(1.0-sm/norm_square_alg(NMOM,0,X)); + MPI_Reduce(&dmax,&dmax_all,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("|1.0+2*tr{X^2}/||X||^2| = %.1e (whole vector)\n",dmax_all); + printf("(should be less than %.1e or so)\n\n", + DBL_EPSILON*sqrt(8.0*(double)(NMOM))); + } + + random_alg(NMOM,X); + random_alg(NMOM,Y); + + nsq1=norm_square_alg(NMOM,1,X); + nsq2=norm_square_alg(NMOM,1,Y); + sprod1=scalar_prod_alg(NMOM,1,X,Y); + + for (n=0;n1)) + { + cij=1.0/4.0; + eij=sqrt(2.0*rn)/4.0; + } + else if (i==j) + { + cij=1.0/9.0; + eij=sqrt(2.0*rn)/9.0; + } + else if ((i==0)&&(j==1)) + { + cij=1.0/18.0; + eij=sqrt(5.0*rn)/18.0; + } + else if ((i<2)&&(j>1)) + { + cij=0.0; + eij=sqrt(rn)/6.0; + } + else + { + cij=0.0; + eij=sqrt(rn)/4.0; + } + + var_all[8*i+j]*=rn; + + if (cij!=0.0) + { + printf(" = % .4e, deviation = %.1e+-%.1e\n", + i,j,var_all[8*i+j],fabs(var_all[8*i+j]-cij),eij); + } + else + { + dev=fabs(var_all[8*i+j])/eij; + + if (dev>dmax) + dmax=dev; + } + } + } + + eij=sqrt(rn)/4.0; + printf("\n"); + printf("For all other i,j, "); + printf("max|| = %.1e (should be %.1e or so)\n\n", + dmax*eij,2.0*eij); + } + + rn=-1.2345; + random_alg(NMOM,X); + random_alg(NMOM,Y); + + nsq1=norm_square_alg(NMOM,1,X); + nsq2=norm_square_alg(NMOM,1,Y); + sprod1=scalar_prod_alg(NMOM,1,X,Y); + + muladd_assign_alg(NMOM,rn,X,Y); + sm=norm_square_alg(NMOM,1,Y)-nsq2-rn*rn*nsq1-2.0*rn*sprod1; + sm=fabs(sm)/nsq1; + + if (my_rank==0) + { + printf("Check of muladd_assign_alg(): %.1e\n",sm); + printf("(should be less than 1.0e-15 or so)\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..a68decfd327a24970637087da57e195b722fc87a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check2.c @@ -0,0 +1,422 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2005, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Checks on the programs in the module salg.c +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "linalg.h" +#include "global.h" + +#define _acc_sp(z,x,y) \ + (z).re+=(double)((x).re*(y).re+(x).im*(y).im); \ + (z).im+=(double)((x).re*(y).im-(x).im*(y).re) + +static complex v[25]; +static spinor *ppk[5]; + + +static complex sp(int vol,spinor *pk,spinor *pl) +{ + complex w; + complex_dble z; + spinor *pm; + + z.re=0.0; + z.im=0.0; + pm=pk+vol; + + for (;pk1)) + { + if (my_rank==0) + { + if (icom==1) + { + printf("Checks with global summation\n"); + printf("============================\n\n"); + } + else + { + printf("Checks without global summation\n"); + printf("===============================\n\n"); + } + } + + for (ieo=0;ieo<3;ieo++) + { + if (my_rank==0) + { + if (ieo==0) + printf("First case: full lattice\n\n"); + else if (ieo==1) + printf("Second case: even points\n\n"); + else + printf("Third case: odd points\n\n"); + } + + vol=VOLUME/2; + off=0; + + if (ieo==0) + vol=VOLUME; + if (ieo==2) + off=VOLUME/2; + + for (i=0;i<10;i++) + random_s(vol,ps[i]+off,1.0f); + + dmax=0.0; + + for (i=0;i<10;i++) + { + pk=ps[i]+off; + pl=ps[9-i]+off; + + if (icom==1) + { + z=sp(vol,pk,pl); + MPI_Reduce(&z.re,&w.re,2,MPI_FLOAT,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&w.re,2,MPI_FLOAT,0,MPI_COMM_WORLD); + } + else + w=sp(vol,pk,pl); + + z=spinor_prod(vol,icom,pk,pl); + r=norm_square(vol,icom,pk)*norm_square(vol,icom,pl); + d=(double)((z.re-w.re)*(z.re-w.re)+(z.im-w.im)*(z.im-w.im)); + d=sqrt(d/(double)(r)); + if (d>dmax) + dmax=d; + + r=spinor_prod_re(vol,icom,pk,pl); + d=fabs((double)(z.re/r-1.0f)); + if (d>dmax) + dmax=d; + + z=spinor_prod(vol,icom,pk,pk); + r=norm_square(vol,icom,pk); + + d=fabs((double)(z.im/r)); + if (d>dmax) + dmax=d; + + d=fabs((double)(z.re/r-1.0f)); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Check of spinor_prod, spinor_prod_re\n"); + printf("and norm_square: %.2e\n\n",dmax); + } + + dmax=0.0; + z.re= 0.345f; + z.im=-0.876f; + zsq=z.re*z.re+z.im*z.im; + + for (i=0;i<9;i++) + { + pk=ps[i]+off; + pl=ps[i+1]+off; + + w=spinor_prod(vol,icom,pk,pl); + r=norm_square(vol,icom,pk)+zsq*norm_square(vol,icom,pl) + +2.0f*(z.re*w.re-z.im*w.im); + mulc_spinor_add(vol,pk,pl,z); + + d=fabs((double)(r/norm_square(vol,icom,pk)-1.0f)); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of spinor_prod, norm_square\n"); + printf("and mulc_spinor_add: %.2e\n\n",dmax); + } + + for (i=0;i<10;i++) + random_s(vol,ps[i]+off,1.0f); + + dmax=0.0; + r=-1.234f; + z.re=-r; + z.im=0.0f; + + for (i=0;i<8;i+=3) + { + pk=ps[i]+off; + pl=ps[i+1]+off; + pj=ps[i+2]+off; + + assign_s2s(vol,pk,pj); + mulr_spinor_add(vol,pk,pl,r); + mulc_spinor_add(vol,pk,pl,z); + mulr_spinor_add(vol,pk,pj,-1.0); + + d=(double)(norm_square(vol,icom,pk)/norm_square(vol,icom,pj)); + d=sqrt(d); + if (d>dmax) + dmax=d; + + assign_s2s(vol,pl,pk); + scale(vol,r,pk); + mulc_spinor_add(vol,pk,pl,z); + + d=(double)(norm_square(vol,icom,pk)/norm_square(vol,icom,pl)); + d=sqrt(d); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of mulr_spinor_add, scale\n"); + printf("and mulc_spinor_add: %.2e\n\n",dmax); + } + + for (i=0;i<10;i++) + random_s(vol,ps[i]+off,1.0f); + + dmax=0.0; + + for (i=0;i<10;i++) + { + pk=ps[i]+off; + + if (i>0) + { + pl=ps[i-1]+off; + project(vol,icom,pk,pl); + z=spinor_prod(vol,icom,pk,pl); + + d=(fabs((double)(z.re))+ + fabs((double)(z.im)))/ + sqrt((double)(norm_square(vol,icom,pk))); + + if (d>dmax) + dmax=d; + } + + normalize(vol,icom,pk); + r=norm_square(vol,icom,pk); + + d=fabs((double)(r-1.0f)); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of spinor_prod, norm_square,\n"); + printf("normalize and project: %.2e\n\n",dmax); + } + + for (i=0;i<5;i++) + { + pk=ps[i]+off; + pl=ps[i+5]+off; + + random_s(vol,ps[i]+off,1.0f); + assign_s2s(vol,pk,pl); + + for (j=0;j<5;j++) + { + v[5*i+j].re=0.1234f*(float)(i^2)-0.8976f*(float)(j); + v[5*i+j].im=0.2231f*(float)(i)+0.9922f*(float)(j^2); + } + + ppk[i]=pl; + } + + rotate(vol,5,ppk,v); + dmax=0.0; + + for (i=5;i<10;i++) + { + pk=ps[i]+off; + + for (j=0;j<5;j++) + { + z.re=-v[5*j+(i-5)].re; + z.im=-v[5*j+(i-5)].im; + + pl=ps[j]+off; + mulc_spinor_add(vol,pk,pl,z); + } + + r=norm_square(vol,icom,pk); + + d=fabs((double)(r)); + if (d>dmax) + dmax=d; + } + + dmax/=(double)(norm_square(vol,icom,ps[0]+off)); + dmax=sqrt(dmax); + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of mulc_spinor_add\n"); + printf("and rotate: %.2e\n\n",dmax); + } + + dmax=0.0; + + for (i=0;i<5;i++) + { + pk=ps[i]+off; + pl=ps[9-i]+off; + random_s(vol,pk,1.0f); + assign_s2s(vol,pk,pl); + mulg5(vol,pk); + mulg5(vol,pk); + + z.re=-1.0f; + z.im=0.0f; + + mulc_spinor_add(vol,pl,pk,z); + r=norm_square(vol,icom,pl)/norm_square(vol,icom,pk); + d=sqrt((double)(r)); + if (d>dmax) + dmax=d; + + random_s(vol,pl,1.0f); + z=spinor_prod(vol,icom,pk,pl); + mulg5(vol,pk); + mulg5(vol,pl); + w=spinor_prod(vol,icom,pk,pl); + + d=(fabs((double)(z.re-w.re))+fabs((double)(z.im-w.im)))/ + (fabs((double)(z.re))+fabs((double)(z.im))); + if (d>dmax) + dmax=d; + + random_s(vol,pk,1.0f); + assign_s2s(vol,pk,pl); + mulg5(vol,pk); + mulmg5(vol,pk); + + z.re=1.0f; + z.im=0.0f; + + mulc_spinor_add(vol,pl,pk,z); + r=norm_square(vol,icom,pl)/norm_square(vol,icom,pk); + d=sqrt((double)(r)); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Check of mulg5 and mulmg5: %.2e\n\n",dmax); + } + } + } + } + + error_chk(); + + if (my_rank==0) + { + printf("Maximal deviation in all tests: %.2e\n\n",dall); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..65242421c7215e5a4eee714f9205a4a2557fea0f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check3.c @@ -0,0 +1,475 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2005, 2011, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Checks on the programs in the module salg_dble.c +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "linalg.h" +#include "global.h" + +#define _acc_sp(z,x,y) \ + (z).re+=(double)((x).re*(y).re+(x).im*(y).im); \ + (z).im+=(double)((x).re*(y).im-(x).im*(y).re) + +static complex_dble v[25]; +static spinor_dble *ppk[5]; + + +static complex_dble sp(int vol,spinor_dble *pk,spinor_dble *pl) +{ + complex_dble z; + spinor_dble *pm; + + z.re=0.0; + z.im=0.0; + pm=pk+vol; + + for (;pk1)) + { + if (my_rank==0) + { + if (icom==1) + { + printf("Checks with global summation\n"); + printf("============================\n\n"); + } + else + { + printf("Checks without global summation\n"); + printf("===============================\n\n"); + } + } + + for (ieo=0;ieo<3;ieo++) + { + if (my_rank==0) + { + if (ieo==0) + printf("First case: full lattice\n\n"); + else if (ieo==1) + printf("Second case: even points\n\n"); + else + printf("Third case: odd points\n\n"); + } + + vol=VOLUME/2; + off=0; + + if (ieo==0) + vol=VOLUME; + if (ieo==2) + off=VOLUME/2; + + for (i=0;i<10;i++) + random_sd(vol,psd[i]+off,1.0); + + dmax=0.0; + + for (i=0;i<10;i++) + { + pk=psd[i]+off; + pl=psd[9-i]+off; + + if (icom==1) + { + z=sp(vol,pk,pl); + MPI_Reduce(&z.re,&w.re,2,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&w.re,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + w=sp(vol,pk,pl); + + z=spinor_prod_dble(vol,icom,pk,pl); + r=norm_square_dble(vol,icom,pk)*norm_square_dble(vol,icom,pl); + d=(z.re-w.re)*(z.re-w.re)+(z.im-w.im)*(z.im-w.im); + d=sqrt(d/r); + if (d>dmax) + dmax=d; + + r=spinor_prod_re_dble(vol,icom,pk,pl); + + d=fabs(z.re/r-1.0); + if (d>dmax) + dmax=d; + + z=spinor_prod_dble(vol,icom,pk,pk); + r=norm_square_dble(vol,icom,pk); + + d=fabs(z.im/r); + if (d>dmax) + dmax=d; + + d=fabs(z.re/r-1.0); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Check of spinor_prod, spinor_prod_re\n"); + printf("and norm_square: %.2e\n\n",dmax); + } + + dmax=0.0; + + for (i=0;i<10;i++) + { + pk=psd[i]+off; + pl=psd[9-i]+off; + + z=spinor_prod5_dble(vol,icom,pk,pl); + mulg5_dble(vol,pl); + w=spinor_prod_dble(vol,icom,pk,pl); + + r=norm_square_dble(vol,icom,pk)*norm_square_dble(vol,icom,pl); + d=(z.re-w.re)*(z.re-w.re)+(z.im-w.im)*(z.im-w.im); + d=sqrt(d/r); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency check of spinor_prod5, mulg5\n"); + printf("and spinor_prod: %.2e\n\n",dmax); + } + + dmax=0.0; + z.re= 0.345; + z.im=-0.876; + zsq=z.re*z.re+z.im*z.im; + + for (i=0;i<9;i++) + { + pk=psd[i]+off; + pl=psd[i+1]+off; + + w=spinor_prod_dble(vol,icom,pk,pl); + r=norm_square_dble(vol,icom,pk)+zsq*norm_square_dble(vol,icom,pl) + +2.0*(z.re*w.re-z.im*w.im); + mulc_spinor_add_dble(vol,pk,pl,z); + + d=fabs(r/norm_square_dble(vol,icom,pk)-1.0); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of spinor_prod, norm_square\n"); + printf("and mulc_spinor_add: %.2e\n\n",dmax); + } + + for (i=0;i<10;i++) + random_sd(vol,psd[i]+off,1.0); + + dmax=0.0; + r=-1.234; + z.re=-r; + z.im=0.0; + + for (i=0;i<8;i+=3) + { + pk=psd[i]+off; + pl=psd[i+1]+off; + pj=psd[i+2]+off; + + assign_sd2sd(vol,pk,pj); + mulr_spinor_add_dble(vol,pk,pl,r); + mulc_spinor_add_dble(vol,pk,pl,z); + mulr_spinor_add_dble(vol,pk,pj,-1.0); + + d=norm_square_dble(vol,icom,pk)/norm_square_dble(vol,icom,pj); + d=sqrt(d); + if (d>dmax) + dmax=d; + + assign_sd2sd(vol,pl,pk); + scale_dble(vol,r,pk); + mulc_spinor_add_dble(vol,pk,pl,z); + + d=norm_square_dble(vol,icom,pk)/norm_square_dble(vol,icom,pl); + d=sqrt(d); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of mulr_spinor_add, scale\n"); + printf("and mulc_spinor_add: %.2e\n\n",dmax); + } + + for (i=0;i<10;i++) + random_sd(vol,psd[i]+off,1.0); + + dmax=0.0; + cs=0.785; + cr=-1.567; + + for (i=0;i<8;i+=3) + { + pk=psd[i]+off; + pl=psd[i+1]+off; + pj=psd[i+2]+off; + + assign_sd2sd(vol,pk,pj); + combine_spinor_dble(vol,pk,pl,cs,cr); + scale_dble(vol,cs,pj); + mulr_spinor_add_dble(vol,pj,pl,cr); + mulr_spinor_add_dble(vol,pk,pj,-1.0); + + d=norm_square_dble(vol,icom,pk)/norm_square_dble(vol,icom,pj); + d=sqrt(d); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of mulr_spinor_add, scale\n"); + printf("and combine_spinor: %.2e\n\n",dmax); + } + + for (i=0;i<10;i++) + random_sd(vol,psd[i]+off,1.0); + + dmax=0.0; + + for (i=0;i<10;i++) + { + pk=psd[i]+off; + + if (i>0) + { + pl=psd[i-1]+off; + project_dble(vol,icom,pk,pl); + z=spinor_prod_dble(vol,icom,pk,pl); + + d=(fabs(z.re)+fabs(z.im))/sqrt(norm_square_dble(vol,icom,pk)); + + if (d>dmax) + dmax=d; + } + + normalize_dble(vol,icom,pk); + r=norm_square_dble(vol,icom,pk); + + d=fabs(r-1.0); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of spinor_prod, norm_square,\n"); + printf("normalize and project: %.2e\n\n",dmax); + } + + for (i=0;i<5;i++) + { + pk=psd[i]+off; + pl=psd[i+5]+off; + + random_sd(vol,psd[i]+off,1.0); + assign_sd2sd(vol,pk,pl); + + for (j=0;j<5;j++) + { + v[5*i+j].re=0.1234*(double)(i^2)-0.8976*(double)(j); + v[5*i+j].im=0.2231*(double)(i)+0.9922*(double)(j^2); + } + + ppk[i]=pl; + } + + rotate_dble(vol,5,ppk,v); + dmax=0.0; + + for (i=5;i<10;i++) + { + pk=psd[i]+off; + + for (j=0;j<5;j++) + { + z.re=-v[5*j+(i-5)].re; + z.im=-v[5*j+(i-5)].im; + + pl=psd[j]+off; + mulc_spinor_add_dble(vol,pk,pl,z); + } + + r=norm_square_dble(vol,icom,pk); + + d=fabs(r); + if (d>dmax) + dmax=d; + } + + dmax/=norm_square_dble(vol,icom,psd[0]+off); + dmax=sqrt(dmax); + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of mulc_spinor_add\n"); + printf("and rotate: %.2e\n\n",dmax); + } + + dmax=0.0; + + for (i=0;i<5;i++) + { + pk=psd[i]+off; + pl=psd[9-i]+off; + random_sd(vol,pk,1.0); + assign_sd2sd(vol,pk,pl); + mulg5_dble(vol,pk); + mulg5_dble(vol,pk); + + z.re=-1.0; + z.im=0.0; + + mulc_spinor_add_dble(vol,pl,pk,z); + r=norm_square_dble(vol,icom,pl)/norm_square_dble(vol,icom,pk); + d=sqrt(r); + if (d>dmax) + dmax=d; + + random_sd(vol,pl,1.0); + z=spinor_prod_dble(vol,icom,pk,pl); + mulg5_dble(vol,pk); + mulg5_dble(vol,pl); + w=spinor_prod_dble(vol,icom,pk,pl); + + d=(fabs(z.re-w.re)+fabs(z.im-w.im))/ + (fabs(z.re)+fabs(z.im)); + if (d>dmax) + dmax=d; + + random_sd(vol,pk,1.0); + assign_sd2sd(vol,pk,pl); + mulg5_dble(vol,pk); + mulmg5_dble(vol,pk); + + z.re=1.0; + z.im=0.0; + + mulc_spinor_add_dble(vol,pl,pk,z); + r=norm_square_dble(vol,icom,pl)/norm_square_dble(vol,icom,pk); + d=sqrt(r); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Check of mulg5 and mulmg5: %.2e\n\n",dmax); + } + } + } + } + + error_chk(); + + if (my_rank==0) + { + printf("Maximal deviation in all tests: %.2e\n\n",dall); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check4.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check4.c new file mode 100644 index 0000000000000000000000000000000000000000..d5bfb3625d4547b7ed825508bb39a4a7773b91e7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check4.c @@ -0,0 +1,321 @@ + +/******************************************************************************* +* +* File check4.c +* +* Copyright (C) 2007, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Consistency checks on the programs in the module valg +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "vflds.h" +#include "linalg.h" +#include "global.h" + + +static complex v[25]; +static complex *ppk[5]; + + +static complex sp(int vol,complex *pk,complex *pl) +{ + int ix; + double x,y; + complex z; + + x=0.0; + y=0.0; + + for (ix=0;ix1)) + { + if (my_rank==0) + { + if (icom==1) + { + printf("Checks with global summation\n"); + printf("============================\n\n"); + } + else + { + printf("Checks without global summation\n"); + printf("===============================\n\n"); + } + } + + for (ieo=0;ieo<3;ieo++) + { + if (my_rank==0) + { + if (ieo==0) + printf("First case: full lattice\n\n"); + else if (ieo==1) + printf("Second case: even points\n\n"); + else + printf("Third case: odd points\n\n"); + } + + vol=nv/2; + off=0; + + if (ieo==0) + vol=nv; + if (ieo==2) + off=nv/2; + + for (i=0;i<10;i++) + random_v(vol,wv[i]+off,1.0f); + + dmax=0.0; + + for (i=0;i<10;i++) + { + pk=wv[i]+off; + pl=wv[9-i]+off; + + if (icom==1) + { + z=sp(vol,pk,pl); + MPI_Reduce(&z.re,&w.re,2,MPI_FLOAT,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&w.re,2,MPI_FLOAT,0,MPI_COMM_WORLD); + } + else + w=sp(vol,pk,pl); + + z=vprod(vol,icom,pk,pl); + r=vnorm_square(vol,icom,pk)*vnorm_square(vol,icom,pl); + d=(double)((z.re-w.re)*(z.re-w.re)+(z.im-w.im)*(z.im-w.im)); + d=sqrt(d/(double)(r)); + if (d>dmax) + dmax=d; + + z=vprod(vol,icom,pk,pk); + r=vnorm_square(vol,icom,pk); + + d=fabs((double)(z.im/r)); + if (d>dmax) + dmax=d; + + d=fabs((double)(z.re/r-1.0f)); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Check of vprod and vnorm_square: %.2e\n\n",dmax); + } + + dmax=0.0; + z.re= 0.345f; + z.im=-0.876f; + zsq=z.re*z.re+z.im*z.im; + + for (i=0;i<9;i++) + { + pk=wv[i]+off; + pl=wv[i+1]+off; + + w=vprod(vol,icom,pk,pl); + r=vnorm_square(vol,icom,pk)+zsq*vnorm_square(vol,icom,pl) + +2.0f*(z.re*w.re-z.im*w.im); + mulc_vadd(vol,pk,pl,z); + + d=fabs((double)(r/vnorm_square(vol,icom,pk)-1.0f)); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of vprod, vnorm_square\n"); + printf("and mulc_vadd: %.2e\n\n",dmax); + } + + for (i=0;i<10;i++) + random_v(vol,wv[i]+off,1.0f); + + dmax=0.0; + + for (i=0;i<10;i++) + { + pk=wv[i]+off; + + if (i>0) + { + pl=wv[i-1]+off; + vproject(vol,icom,pk,pl); + z=vprod(vol,icom,pk,pl); + + d=(fabs((double)(z.re))+ + fabs((double)(z.im)))/ + sqrt((double)(vnorm_square(vol,icom,pk))); + + if (d>dmax) + dmax=d; + } + + vnormalize(vol,icom,pk); + r=vnorm_square(vol,icom,pk); + + d=fabs((double)(r-1.0f)); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of vprod, vnorm_square,\n"); + printf("vnormalize and vproject: %.2e\n\n",dmax); + } + + for (i=0;i<5;i++) + { + pk=wv[i]+off; + pl=wv[i+5]+off; + + random_v(vol,wv[i]+off,1.0f); + assign_v2v(vol,pk,pl); + + for (j=0;j<5;j++) + { + v[5*i+j].re=0.1234f*(float)(i^2)-0.8976f*(float)(j); + v[5*i+j].im=0.2231f*(float)(i)+0.9922f*(float)(j^2); + } + + ppk[i]=pl; + } + + vrotate(vol,5,ppk,v); + dmax=0.0; + + for (i=5;i<10;i++) + { + pk=wv[i]+off; + + for (j=0;j<5;j++) + { + z.re=-v[5*j+(i-5)].re; + z.im=-v[5*j+(i-5)].im; + + pl=wv[j]+off; + mulc_vadd(vol,pk,pl,z); + } + + r=vnorm_square(vol,icom,pk); + + d=fabs((double)(r)); + if (d>dmax) + dmax=d; + } + + dmax/=(double)(vnorm_square(vol,icom,wv[0]+off)); + dmax=sqrt(dmax); + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of mulc_vadd\n"); + printf("and vrotate: %.2e\n\n",dmax); + } + } + } + } + + error_chk(); + + if (my_rank==0) + { + printf("Maximal deviation in all tests: %.2e\n\n",dall); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check4.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check4.in new file mode 100644 index 0000000000000000000000000000000000000000..d3202ab3cc8269cb289e4fdf5e9a62838f8a8119 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check4.in @@ -0,0 +1 @@ +bs 4 4 4 4 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check5.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check5.c new file mode 100644 index 0000000000000000000000000000000000000000..ec9883519ae8fa038bdfe9f22f104586d16c5f1c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check5.c @@ -0,0 +1,321 @@ + +/******************************************************************************* +* +* File check5.c +* +* Copyright (C) 2007, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Checks on the programs in the module valg_dble +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "vflds.h" +#include "linalg.h" +#include "global.h" + +static complex_dble v[25]; +static complex_dble *ppk[5]; + + +static complex_dble sp(int vol,complex_dble *pk,complex_dble *pl) +{ + int ix; + double x,y; + complex_dble z; + + x=0.0; + y=0.0; + + for (ix=0;ix1)) + { + if (my_rank==0) + { + if (icom==1) + { + printf("Checks with global summation\n"); + printf("============================\n\n"); + } + else + { + printf("Checks without global summation\n"); + printf("===============================\n\n"); + } + } + + for (ieo=0;ieo<3;ieo++) + { + if (my_rank==0) + { + if (ieo==0) + printf("First case: full lattice\n\n"); + else if (ieo==1) + printf("Second case: even points\n\n"); + else + printf("Third case: odd points\n\n"); + } + + vol=nv/2; + off=0; + + if (ieo==0) + vol=nv; + if (ieo==2) + off=nv/2; + + for (i=0;i<10;i++) + random_vd(vol,wvd[i]+off,1.0f); + + dmax=0.0; + + for (i=0;i<10;i++) + { + pk=wvd[i]+off; + pl=wvd[9-i]+off; + + if (icom==1) + { + z=sp(vol,pk,pl); + MPI_Reduce(&z.re,&w.re,2,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&w.re,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + w=sp(vol,pk,pl); + + z=vprod_dble(vol,icom,pk,pl); + r=vnorm_square_dble(vol,icom,pk)*vnorm_square_dble(vol,icom,pl); + d=(z.re-w.re)*(z.re-w.re)+(z.im-w.im)*(z.im-w.im); + d=sqrt(d/r); + if (d>dmax) + dmax=d; + + z=vprod_dble(vol,icom,pk,pk); + r=vnorm_square_dble(vol,icom,pk); + + d=fabs(z.im/r); + if (d>dmax) + dmax=d; + + d=fabs(z.re/r-1.0); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Check of vprod_dble and vnorm_square_dble: %.2e\n\n", + dmax); + } + + dmax=0.0; + z.re= 0.345; + z.im=-0.876; + zsq=z.re*z.re+z.im*z.im; + + for (i=0;i<9;i++) + { + pk=wvd[i]+off; + pl=wvd[i+1]+off; + + w=vprod_dble(vol,icom,pk,pl); + r=vnorm_square_dble(vol,icom,pk)+ + zsq*vnorm_square_dble(vol,icom,pl) + +2.0f*(z.re*w.re-z.im*w.im); + mulc_vadd_dble(vol,pk,pl,z); + + d=fabs(r/vnorm_square_dble(vol,icom,pk)-1.0); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of vprod_dble, vnorm_square_dble\n"); + printf("and mulc_vadd_dble: %.2e\n\n",dmax); + } + + for (i=0;i<10;i++) + random_vd(vol,wvd[i]+off,1.0f); + + dmax=0.0; + + for (i=0;i<10;i++) + { + pk=wvd[i]+off; + + if (i>0) + { + pl=wvd[i-1]+off; + vproject_dble(vol,icom,pk,pl); + z=vprod_dble(vol,icom,pk,pl); + + d=(fabs(z.re)+fabs(z.im))/ + sqrt(vnorm_square_dble(vol,icom,pk)); + + if (d>dmax) + dmax=d; + } + + vnormalize_dble(vol,icom,pk); + r=vnorm_square_dble(vol,icom,pk); + + d=fabs(r-1.0); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of vprod_dble, vnorm_square_dble,\n"); + printf("vnormalize_dble and vproject_dble: %.2e\n\n",dmax); + } + + for (i=0;i<5;i++) + { + pk=wvd[i]+off; + pl=wvd[i+5]+off; + + random_vd(vol,wvd[i]+off,1.0f); + assign_vd2vd(vol,pk,pl); + + for (j=0;j<5;j++) + { + v[5*i+j].re=0.1234*(double)(i^2)-0.8976*(double)(j); + v[5*i+j].im=0.2231*(double)(i)+0.9922*(double)(j^2); + } + + ppk[i]=pl; + } + + vrotate_dble(vol,5,ppk,v); + dmax=0.0; + + for (i=5;i<10;i++) + { + pk=wvd[i]+off; + + for (j=0;j<5;j++) + { + z.re=-v[5*j+(i-5)].re; + z.im=-v[5*j+(i-5)].im; + + pl=wvd[j]+off; + mulc_vadd_dble(vol,pk,pl,z); + } + + r=vnorm_square_dble(vol,icom,pk); + + d=fabs(r); + if (d>dmax) + dmax=d; + } + + dmax/=vnorm_square_dble(vol,icom,wvd[0]+off); + dmax=sqrt(dmax); + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of mulc_vadd_dble\n"); + printf("and vrotate_dble: %.2e\n\n",dmax); + } + } + } + } + + error_chk(); + + if (my_rank==0) + { + printf("Maximal deviation in all tests: %.2e\n\n",dall); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/time1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/time1.c new file mode 100644 index 0000000000000000000000000000000000000000..de793e3409abc7028faa9a4f0fe1d429e876ac63 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/time1.c @@ -0,0 +1,503 @@ + +/******************************************************************************* +* +* File time1.c +* +* Copyright (C) 2005, 2008, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Timing of the salg routines +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "linalg.h" +#include "global.h" + +static complex *vmat,*wmat; +static spinor **ps,*ppk[5]; + + +static double wt_spinor_prod(int nflds,int icom) +{ + int my_rank,nmax,n,i,ib; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + for (i=0;i2.0) + ib=1; + + wtav/=(double)((nmax*nflds)/2); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_norm_square(int nflds,int icom) +{ + int my_rank,nmax,n,i,ib; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)(nmax*nflds); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_normalize(int nflds,int icom) +{ + int my_rank,nmax,n,i,ib; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)(nmax*nflds); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_mulc_spinor_add(int nflds) +{ + int my_rank,nmax,n,i,ib; + complex z; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + z.re=0.123f; + z.im=0.456f; + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)((nmax*nflds)/2); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_project(int nflds,int icom) +{ + int my_rank,nmax,n,i,ib; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + for (i=0;i2.0) + ib=1; + + wtav/=(double)((nmax*nflds)/2); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static void gram_schmidt(int n,spinor **s) +{ + int i,j,k; + + for (i=0;i2.0) + ib=1; + + wtav/=(double)(2*nmax); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +int main(int argc,char *argv[]) +{ + int my_rank,icom,nflds; + double wdt; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time1.log","w",stdout); + + printf("\n"); + printf("Timing of the salg routines\n"); + printf("---------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + if (NPROC>1) + printf("There are %d MPI processes\n",NPROC); + else + printf("There is 1 MPI process\n"); + + if ((VOLUME*sizeof(float))<(64*1024)) + printf("The local size of a quark field is %d KB\n", + (int)((24*VOLUME*sizeof(float))/(1024))); + else + printf("The local size of a quark field is %d MB\n", + (int)((24*VOLUME*sizeof(float))/(1024*1024))); + +#if (defined x64) +#if (defined AVX) + printf("Using AVX instructions\n"); +#else + printf("Using SSE3 instructions and 16 xmm registers\n"); +#endif +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + printf("\n"); + } + + icom=1; + start_ranlux(0,12345); + geometry(); + + nflds=(int)((4*1024*1024)/(VOLUME*sizeof(float)))+1; + if ((nflds%2)==1) + nflds+=1; + if (nflds<10) + nflds=10; + alloc_ws(nflds); + ps=reserve_ws(nflds); + + wdt=1.0e6*wt_spinor_prod(nflds,icom)/(double)(VOLUME); + + if (my_rank==0) + { + printf("Function spinor_prod:\n"); + printf("Time per lattice point: %4.3f micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(96.0/wdt),(int)(sizeof(spinor))/3); + } + + wdt=1.0e6*wt_norm_square(nflds,icom)/(double)(VOLUME); + + if (my_rank==0) + { + printf("Function norm_square:\n"); + printf("Time per lattice point: %4.3f micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(48.0/wdt),(int)(sizeof(spinor))/3); + } + + wdt=1.0e6*wt_normalize(nflds,icom)/(double)(VOLUME); + + if (my_rank==0) + { + printf("Function normalize:\n"); + printf("Time per lattice point: %4.3f micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(72.0/wdt),(int)(sizeof(spinor))/3); + } + + wdt=1.0e6*wt_mulc_spinor_add(nflds)/(double)(VOLUME); + + if (my_rank==0) + { + printf("Function mulc_spinor_add:\n"); + printf("Time per lattice point: %4.3f micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(96.0/wdt),(int)(sizeof(spinor))/3); + } + + wdt=1.0e6*wt_project(nflds,icom)/(double)(VOLUME); + + if (my_rank==0) + { + printf("Function project:\n"); + printf("Time per lattice point: %4.3f micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(192.0/wdt),(int)(sizeof(spinor))/3); + } + + wdt=1.0e6*wt_rotate()/(double)(25*VOLUME); + error_chk(); + + if (my_rank==0) + { + printf("Function rotate (n=5 fields):\n"); + printf("Time per lattice point: %4.3f*n^2 micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(91.2/wdt),(int)(sizeof(spinor))/3); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/time2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/time2.c new file mode 100644 index 0000000000000000000000000000000000000000..0d4f1eff68034d21824b293481ffde4f46671ed5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/time2.c @@ -0,0 +1,503 @@ + +/******************************************************************************* +* +* File time2.c +* +* Copyright (C) 2005, 2008, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Timing of the salg_dble routines +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "linalg.h" +#include "global.h" + +static complex_dble *vmat,*wmat; +static spinor_dble **psd,*ppk[5]; + + +static double wt_spinor_prod_dble(int nflds,int icom) +{ + int my_rank,nmax,n,i,ib; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + for (i=0;i2.0) + ib=1; + + wtav/=(double)((nmax*nflds)/2); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_norm_square_dble(int nflds,int icom) +{ + int my_rank,nmax,n,i,ib; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)(nmax*nflds); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_normalize_dble(int nflds,int icom) +{ + int my_rank,nmax,n,i,ib; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)(nmax*nflds); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_mulc_spinor_add_dble(int nflds) +{ + int my_rank,nmax,n,i,ib; + complex_dble z; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + z.re=0.123; + z.im=0.456; + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)((nmax*nflds)/2); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_project_dble(int nflds,int icom) +{ + int my_rank,nmax,n,i,ib; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + for (i=0;i2.0) + ib=1; + + wtav/=(double)((nmax*nflds)/2); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static void gram_schmidt(int n,spinor_dble **s) +{ + int i,j,k; + + for (i=0;i2.0) + ib=1; + + wtav/=(double)(2*nmax); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +int main(int argc,char *argv[]) +{ + int my_rank,icom,nflds; + double wdt; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time2.log","w",stdout); + + printf("\n"); + printf("Timing of the salg_dble routines\n"); + printf("--------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + if (NPROC>1) + printf("There are %d MPI processes\n",NPROC); + else + printf("There is 1 MPI process\n"); + + if ((VOLUME*sizeof(double))<(64*1024)) + printf("The local size of a quark field is %d KB\n", + (int)((24*VOLUME*sizeof(double))/(1024))); + else + printf("The local size of a quark field is %d MB\n", + (int)((24*VOLUME*sizeof(double))/(1024*1024))); + +#if (defined x64) +#if (defined AVX) + printf("Using AVX instructions\n"); +#else + printf("Using SSE3 instructions and 16 xmm registers\n"); +#endif +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + printf("\n"); + } + + icom=1; + start_ranlux(0,12345); + geometry(); + + nflds=(int)((4*1024*1024)/(VOLUME*sizeof(double)))+1; + if ((nflds%2)==1) + nflds+=1; + if (nflds<10) + nflds=10; + alloc_wsd(nflds); + psd=reserve_wsd(nflds); + + wdt=1.0e6*wt_spinor_prod_dble(nflds,icom)/(double)(VOLUME); + + if (my_rank==0) + { + printf("Function spinor_prod_dble:\n"); + printf("Time per lattice point: %4.3f micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(96.0/wdt),(int)(sizeof(spinor_dble))/3); + } + + wdt=1.0e6*wt_norm_square_dble(nflds,icom)/(double)(VOLUME); + + if (my_rank==0) + { + printf("Function norm_square_dble:\n"); + printf("Time per lattice point: %4.3f micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(48.0/wdt),(int)(sizeof(spinor_dble))/3); + } + + wdt=1.0e6*wt_normalize_dble(nflds,icom)/(double)(VOLUME); + + if (my_rank==0) + { + printf("Function normalize_dble:\n"); + printf("Time per lattice point: %4.3f micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(72.0/wdt),(int)(sizeof(spinor_dble))/3); + } + + wdt=1.0e6*wt_mulc_spinor_add_dble(nflds)/(double)(VOLUME); + + if (my_rank==0) + { + printf("Function mulc_spinor_add_dble:\n"); + printf("Time per lattice point: %4.3f micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(96.0/wdt),(int)(sizeof(spinor_dble))/3); + } + + wdt=1.0e6*wt_project_dble(nflds,icom)/(double)(VOLUME); + + if (my_rank==0) + { + printf("Function project_dble:\n"); + printf("Time per lattice point: %4.3f micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(192.0/wdt),(int)(sizeof(spinor_dble))/3); + } + + wdt=1.0e6*wt_rotate_dble()/(double)(25*VOLUME); + error_chk(); + + if (my_rank==0) + { + printf("Function rotate_dble (n=5 fields):\n"); + printf("Time per lattice point: %4.3f*n^2 micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(91.2/wdt),(int)(sizeof(spinor_dble))/3); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..8ba001fee9b776ab2c98be6ecfbf114a75ba5fdb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/INDEX @@ -0,0 +1,20 @@ + +Little Dirac operator + +check1 Check of the programs in the module Aw_gen.c. + +check2 Check of the program b2b_flds(). + +check3 Direct check of Aw_dble() and Aw(). + +check4 Consistency checks on Aw_dble(),..,Awhat(). + +check5 Check of the program set_ltl_modes(). + +time1 Timing of Awhat(). + +The programs check2,..,time1 accept the option -bc that allows the +type of boundary condition to be chosen at runtime. When the option is not +set, open boundary conditions are assumed. + +The option may be set but has no effect in the case of check1. \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..ae5f543ffa78dd23b188847a4b3395a7119432f6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/Makefile @@ -0,0 +1,154 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 check4 check5 time1 + +FLAGS = flags lat_parms sap_parms dfl_parms + +LATTICE = bcnds ftidx uidx geometry + +LINALG = salg salg_dble valg valg_dble liealg cmatrix_dble cmatrix + +LINSOLV = fgcr + +RANDOM = ranlux ranlxs ranlxd gauss + +UFLDS = plaq_sum shift uflds udcom + +SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +UTILS = endian mutils utils wspace + +SFLDS = sflds scom sdcom Pbnd Pbnd_dble + +TCHARGE = ftcom ftensor + +SW_TERM = pauli pauli_dble swflds sw_term + +DIRAC = Dw_dble Dw Dw_bnd + +BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + +SAP = blk_solv sap_com sap sap_gcr + +ARCHIVE = archive + +DFL = dfl_geometry dfl_subspace + +VFLDS = vflds vinit vcom vdcom + +LITTLE = Aw_gen Aw_com Aw_ops Aw_dble Aw ltl_modes + +MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ + $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) $(DIRAC) \ + $(BLOCK) $(SAP) $(ARCHIVE) $(DFL) $(VFLDS) $(LITTLE) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/linsolv:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ + $(MDIR)/sflds:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:\ + $(MDIR)/block:$(MDIR)/sap:$(MDIR)/archive:$(MDIR)/dfl:\ + $(MDIR)/vflds:$(MDIR)/little + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..ec8a9c6c05f09291e695dfd3e7988daa7f4f2706 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check1.c @@ -0,0 +1,306 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2007, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the programs in the module Aw_gen.c. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "flags.h" +#include "sflds.h" +#include "linalg.h" +#include "little.h" +#include "global.h" + +#define NPTS 2048 + +static int imb[NPTS]; +static su3_dble ud[NPTS],vd[NPTS] ALIGNED16; +static spinor_dble sd[3][NPTS] ALIGNED16; + + +static void random_imb(int vol) +{ + int i,j,a,b; + float r[2],rvol; + + for (i=0;i=vol) + a=vol-1; + b=(int)(rvol*r[1]); + if (b>=vol) + b=vol-1; + + j=imb[a]; + imb[a]=imb[b]; + imb[b]=j; + } +} + + +static void random_ufld(int vol) +{ + int i; + + for (i=0;i +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "dfl.h" +#include "little.h" +#include "global.h" + +static int bs[4],Ns,bc; +static int l[4],np[4]; +static const su3_dble ud0={{0.0}}; + + +static void set_ud(void) +{ + su3_dble unity,*ud,*um; + + unity=ud0; + unity.c11.re=1.0; + unity.c22.re=1.0; + unity.c33.re=1.0; + ud=udfld(); + um=ud+4*VOLUME; + + for (;ud=b[nu])) + ie=2; + } + } + + error(ie!=0,1,"chk_sde0 [check2.c]","Incorrect field components"); +} + + +static void chk_sde1(int mu,int vol,int ibn,int *bo,spinor_dble *sd) +{ + int a[4],b[4],y[4]; + int ix,nu,ie; + + for (nu=0;nu<4;nu++) + { + a[nu]=cpr[nu]*l[nu]+bo[nu]; + b[nu]=a[nu]+bs[nu]; + } + + a[mu]=cpr[mu]*l[mu]+bo[mu]; + if (ibn) + a[mu]=safe_mod(a[mu]+l[mu],np[mu]*l[mu]); + b[mu]=a[mu]+1; + ie=0; + + for (ix=0;ix=b[nu])) + ie=2; + } + } + + error(ie!=0,1,"chk_sde1 [check2.c]","Incorrect field components"); +} + + +static void chk_sdo0(int mu,int vol,int *bo,spinor_dble *sd) +{ + int a[4],b[4],y[4]; + int ix,nu,ie; + + for (nu=0;nu<4;nu++) + { + a[nu]=cpr[nu]*l[nu]+bo[nu]; + b[nu]=a[nu]+bs[nu]; + } + + a[mu]=cpr[mu]*l[mu]+bo[mu]+bs[mu]-1; + b[mu]=a[mu]+1; + ie=0; + + for (ix=0;ix=b[nu])) + ie=2; + } + } + + error(ie!=0,1,"chk_sdo0 [check2.c]","Incorrect field components %d",ie); +} + + +static void chk_sdo1(int mu,int vol,int *bo,spinor_dble *sd) +{ + int a[4],b[4],y[4]; + int ix,nu,ie; + + for (nu=0;nu<4;nu++) + { + a[nu]=cpr[nu]*l[nu]+bo[nu]; + b[nu]=a[nu]+bs[nu]; + } + + a[mu]=cpr[mu]*l[mu]+bo[mu]; + b[mu]=a[mu]+1; + ie=0; + + for (ix=0;ix=b[nu])) + ie=2; + } + } + + error(ie!=0,1,"chk_sdo1 [check2.c]","Incorrect field components"); +} + + +static void cmp_sde0_sdo1(int mu,int vol,int *bo,spinor_dble *sde, + spinor_dble *sdo) +{ + int ye[4],yo[4]; + int ix,nu,ie; + + ie=0; + + for (ix=0;ix]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + Ns=2; + set_dfl_parms(bs,Ns); + alloc_bgr(DFL_BLOCKS); + blk_list(DFL_BLOCKS,&nb,&isw); + + alloc_wsd(Ns); + wsd=reserve_wsd(Ns); + set_ud(); + + for (k=0;k +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "vflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "dfl.h" +#include "little.h" +#include "global.h" + + +static void random_basis(int Ns) +{ + int i; + spinor **ws; + + ws=reserve_ws(Ns); + + for (i=0;i]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + set_sw_parms(-0.0123); + set_dfl_parms(bs,Ns); + mu=0.0376; + + start_ranlux(0,123456); + geometry(); + + alloc_ws(Ns+2); + alloc_wsd(2); + alloc_wv(3); + alloc_wvd(3); + + ws=reserve_ws(2); + wsd=reserve_wsd(2); + wv=reserve_wv(3); + wvd=reserve_wvd(3); + nb=VOLUME/(bs[0]*bs[1]*bs[2]*bs[3]); + nv=Ns*nb; + + random_ud(); + chs_ubnd(-1); + random_basis(Ns); + set_Aw(mu); + sw_term(NO_PTS); + assign_ud2u(); + assign_swd2sw(); + + random_vd(nv,wvd[0],1.0); + Aw_dble(wvd[0],wvd[1]); + dfl_vd2sd(wvd[0],wsd[0]); + Dw_dble(mu,wsd[0],wsd[1]); + dfl_sd2vd(wsd[1],wvd[2]); + + zd.re=-1.0; + zd.im=0.0; + mulc_vadd_dble(nv,wvd[2],wvd[1],zd); + dev=vnorm_square_dble(nv,1,wvd[2])/vnorm_square_dble(nv,1,wvd[1]); + + error_chk(); + + if (my_rank==0) + printf("Relative deviation (Aw_dble) = %.1e\n",sqrt(dev)); + + random_v(nv,wv[0],1.0f); + Aw(wv[0],wv[1]); + dfl_v2s(wv[0],ws[0]); + Dw((float)(mu),ws[0],ws[1]); + dfl_s2v(ws[1],wv[2]); + + z.re=-1.0f; + z.im=0.0f; + mulc_vadd(nv,wv[2],wv[1],z); + dev=(double)(vnorm_square(nv,1,wv[2])/vnorm_square(nv,1,wv[1])); + + error_chk(); + + if (my_rank==0) + { + printf("Relative deviation (Aw) = %.1e\n\n",sqrt(dev)); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check3.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check3.in new file mode 100644 index 0000000000000000000000000000000000000000..4d215004cd2b47624ceeec30351e2dfa500353ac --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check3.in @@ -0,0 +1,2 @@ +bs 4 4 4 4 +Ns 20 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check4.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check4.c new file mode 100644 index 0000000000000000000000000000000000000000..133eeb170fbe185d0a422947811d84a979262ffa --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check4.c @@ -0,0 +1,414 @@ + +/******************************************************************************* +* +* File check4.c +* +* Copyright (C) 2007, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Consistency checks on Aw_dble(),..,Awhat(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "vflds.h" +#include "linalg.h" +#include "dirac.h" +#include "dfl.h" +#include "little.h" +#include "global.h" + +static int bc,Ns; + + +static void random_basis(int Ns) +{ + int i; + spinor **ws; + + ws=reserve_ws(Ns); + + for (i=0;i]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + set_sw_parms(0.125); + set_dfl_parms(bs,Ns); + mu=0.0376; + + start_ranlux(0,123456); + geometry(); + + alloc_ws(Ns); + alloc_wv(4); + alloc_wvd(6); + + wv=reserve_wv(4); + wvd=reserve_wvd(4); + nb=VOLUME/(bs[0]*bs[1]*bs[2]*bs[3]); + nv=Ns*nb; + nvh=nv/2; + + random_ud(); + chs_ubnd(-1); + random_basis(Ns); + + ifail=set_Awhat(mu); + error(ifail!=0,1,"main [check4.c]","Inversion of Aee or Aoo failed"); + + zd.re=-1.0; + zd.im=0.0; + z.re=-1.0f; + z.im=0.0f; + + for (iop=0;iop<6;iop++) + { + if (iop==0) + { + op=Awhat; + pr= "Awhat() "; + op_dble=Awhat_dble; + prd="Awhat_dble() "; + } + else if (iop==1) + { + op=Aweeinv; + pr= "Aweeinv()"; + op_dble=Aweeinv_dble; + prd="Aweeinv_dble()"; + } + else if (iop==2) + { + op=Awooinv; + pr= "Awooinv()"; + op_dble=Awooinv_dble; + prd="Awooinv_dble()"; + } + else if (iop==3) + { + op=Awoe; + pr= "Awoe() "; + op_dble=Awoe_dble; + prd="Awoe_dble() "; + } + else if (iop==4) + { + op=Aweo; + pr= "Aweo() "; + op_dble=Aweo_dble; + prd="Aweo_dble() "; + } + else + { + op=Aw; + pr= "Aw() "; + op_dble=Aw_dble; + prd="Aw_dble() "; + } + + random_vd(nv,wvd[0],1.0); + random_vd(nv,wvd[1],1.0); + assign_vd2vd(nv,wvd[0],wvd[2]); + assign_vd2vd(nv,wvd[1],wvd[3]); + + assign_vd2v(nv,wvd[0],wv[0]); + assign_vd2v(nv,wvd[1],wv[1]); + assign_v2v(nv,wv[0],wv[2]); + assign_v2v(nv,wv[1],wv[3]); + + op_dble(wvd[0],wvd[1]); + op(wv[0],wv[1]); + + mulc_vadd_dble(nv,wvd[2],wvd[0],zd); + d=vnorm_square_dble(nv,0,wvd[2]); + error(d!=0.0,1,"main [check4.c]", + "%s modifies the input field",prd); + + mulc_vadd(nv,wv[2],wv[0],z); + d=(double)(vnorm_square(nv,0,wv[2])); + error(d!=0.0,1,"main [check4.c]", + "%s modifies the input field",pr); + + if ((iop<2)||(iop==4)) + { + mulc_vadd_dble(nvh,wvd[3]+nvh,wvd[1]+nvh,zd); + d=vnorm_square_dble(nvh,0,wvd[3]+nvh); + error(d!=0.0,1,"main [check4.c]", + "%s modifies the odd components of the output field",prd); + + mulc_vadd(nvh,wv[3]+nvh,wv[1]+nvh,z); + d=(double)(vnorm_square(nvh,0,wv[3]+nvh)); + error(d!=0.0,1,"main [check4.c]", + "%s modifies the odd components of the output field",pr); + + assign_vd2v(nvh,wvd[1],wv[0]); + mulc_vadd(nvh,wv[0],wv[1],z); + d=(double)(vnorm_square(nvh,1,wv[0])/ + vnorm_square(nvh,1,wv[1])); + if (my_rank==0) + printf("Deviation of %s from %s: %.1e\n",pr,prd,sqrt(d)); + } + + if ((iop==2)||(iop==3)) + { + mulc_vadd_dble(nvh,wvd[3],wvd[1],zd); + d=vnorm_square_dble(nvh,0,wvd[3]); + error(d!=0.0,1,"main [check4.c]", + "%s modifies the even components of the output field",prd); + + mulc_vadd(nvh,wv[3],wv[1],z); + d=(double)(vnorm_square(nvh,0,wv[3])); + error(d!=0.0,1,"main [check4.c]", + "%s modifies the even components of the output field",pr); + + assign_vd2v(nvh,wvd[1]+nvh,wv[0]+nvh); + mulc_vadd(nvh,wv[0]+nvh,wv[1]+nvh,z); + d=(double)(vnorm_square(nvh,1,wv[0]+nvh)/ + vnorm_square(nvh,1,wv[1]+nvh)); + + if (my_rank==0) + printf("Deviation of %s from %s: %.1e\n",pr,prd,sqrt(d)); + } + + if (iop==5) + { + assign_vd2v(nv,wvd[1],wv[0]); + mulc_vadd(nv,wv[0],wv[1],z); + d=(double)(vnorm_square(nv,1,wv[0])/ + vnorm_square(nv,1,wv[1])); + if (my_rank==0) + printf("Deviation of %s from %s: %.1e\n",pr,prd,sqrt(d)); + } + } + + ifail=set_Awhat(-mu); + error(ifail!=0,1,"main [check4.c]","Inversion of Aee or Aoo failed"); + + random_vd(nvh,wvd[0],1.0); + set_vd2zero(nvh,wvd[0]+nvh); + Aw_dble(wvd[0],wvd[1]); + + Aweeinv_dble(wvd[1],wvd[2]); + mulc_vadd_dble(nvh,wvd[2],wvd[0],zd); + d=vnorm_square_dble(nvh,1,wvd[2])/vnorm_square_dble(nvh,1,wvd[0]); + + if (my_rank==0) + { + printf("\n"); + printf("Comparison of Aweeinv_dble() and Aw_dble(): %.1e\n",sqrt(d)); + } + + Awoe_dble(wvd[0],wvd[2]); + mulc_vadd_dble(nvh,wvd[2]+nvh,wvd[1]+nvh,zd); + d=vnorm_square_dble(nvh,1,wvd[2]+nvh)/vnorm_square_dble(nvh,1,wvd[1]+nvh); + + if (my_rank==0) + printf("Comparison of Awoe_dble() and Aw_dble(): %.1e\n",sqrt(d)); + + random_vd(nvh,wvd[0]+nvh,1.0); + set_vd2zero(nvh,wvd[0]); + Aw_dble(wvd[0],wvd[1]); + + Awooinv_dble(wvd[1],wvd[2]); + mulc_vadd_dble(nvh,wvd[2]+nvh,wvd[0]+nvh,zd); + d=vnorm_square_dble(nvh,1,wvd[2]+nvh)/vnorm_square_dble(nvh,1,wvd[0]+nvh); + + if (my_rank==0) + printf("Comparison of Awooinv_dble() and Aw_dble(): %.1e\n",sqrt(d)); + + random_vd(nvh,wvd[2],1.0); + assign_vd2vd(nvh,wvd[2],wvd[3]); + Aweo_dble(wvd[0],wvd[2]); + mulc_vadd_dble(nvh,wvd[3],wvd[2],zd); + mulc_vadd_dble(nvh,wvd[3],wvd[1],zd); + d=vnorm_square_dble(nvh,1,wvd[3])/vnorm_square_dble(nvh,1,wvd[1]); + + if (my_rank==0) + printf("Comparison of Aweo_dble() and Aw_dble(): %.1e\n",sqrt(d)); + + random_vd(nv,wvd[0],1.0); + Awhat_dble(wvd[0],wvd[1]); + Awoe_dble(wvd[0],wvd[2]); + Awooinv_dble(wvd[2],wvd[3]); + set_vd2zero(nvh,wvd[0]+nvh); + Aw_dble(wvd[0],wvd[2]); + Aweo_dble(wvd[3],wvd[2]); + Aweeinv_dble(wvd[2],wvd[3]); + + mulc_vadd_dble(nvh,wvd[3],wvd[1],zd); + d=vnorm_square_dble(nvh,1,wvd[3])/vnorm_square_dble(nvh,1,wvd[1]); + + + if (my_rank==0) + { + printf("Comparison of Aweeinv_dble(), Awooinv_dble(), \n"); + printf(" Awoe_dble(), Aweo_dble() and Awhat_dble(): %.1e\n\n", + sqrt(d)); + fflush(flog); + } + + ifail=check_bndAwop(); + error(ifail!=0,1,"main [check4.c]", + "Hopping terms Aoe,Aeo at the lattice boundaries do not vanish"); + error_chk(); + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check5.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check5.c new file mode 100644 index 0000000000000000000000000000000000000000..00debd8729fb236da30136876642f0bd8304afa6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check5.c @@ -0,0 +1,360 @@ + +/******************************************************************************* +* +* File check5.c +* +* Copyright (C) 2007, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the program set_ltl_modes(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "vflds.h" +#include "linalg.h" +#include "dirac.h" +#include "dfl.h" +#include "little.h" +#include "global.h" + + +static void random_basis(int Ns) +{ + int i; + spinor **ws; + + ws=reserve_ws(Ns); + + for (i=0;idev) + dev=d; + } + } + + return sqrt(dev); +} + + +static double check_Awvd(int Ns,int nvh) +{ + int i; + double d,dev; + complex_dble **vd,**wvd,z; + + vd=vdflds(); + wvd=reserve_wvd(2); + + dev=0.0; + z.re=-1.0; + z.im=0.0; + + for (i=0;idev) + dev=d; + } + + release_wvd(); + + return sqrt(dev); +} + + +static double check_ltl_matrix(int Ns,int nvh) +{ + int i,j,ie; + double dev; + complex_dble **vd,*amat,*bmat,*cmat,z; + + vd=vdflds(); + amat=ltl_matrix(); + bmat=amalloc(2*Ns*Ns*sizeof(*amat),ALIGN); + error(bmat==NULL,1,"check_ltl_matrix [check5.c]", + "Unable to allocate auxiliary arrays"); + cmat=bmat+Ns*Ns; + + for (i=0;idev) + dev=d; + } + + for (i=0;idev) + dev=d; + } + + release_wvd(); + + return sqrt(dev); +} + + +static double check_mds(int Ns,int nvh) +{ + int nv,k,l; + double d,dev; + complex **vs; + complex_dble **vd,**wvd; + spinor **mds,**ws; + + nv=2*nvh; + mds=reserve_ws(Ns); + ws=reserve_ws(1); + vs=vflds(); + dev=0.0; + + for (k=0;kdev) + dev=d; + } + + release_ws(); + release_ws(); + + vd=vdflds(); + wvd=reserve_wvd(1); + + for (k=0;kdev) + dev=d; + } + + release_wvd(); + + return sqrt(dev); +} + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,ifail; + int bs[4],Ns,nb,nvh; + double phi[2],phi_prime[2]; + double mu,dev; + FILE *fin=NULL,*flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check5.log","w",stdout); + fin=freopen("check3.in","r",stdin); + + printf("\n"); + printf("Check of the program set_ltl_modes()\n"); + printf("------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + read_line("Ns","%d",&Ns); + fclose(fin); + + printf("bs = %d %d %d %d\n",bs[0],bs[1],bs[2],bs[3]); + printf("Ns = %d\n\n",Ns); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check5.c]", + "Syntax: check5 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + set_sw_parms(0.125); + set_dfl_parms(bs,Ns); + mu=0.0376; + + start_ranlux(0,123456); + geometry(); + + alloc_ws(Ns+1); + alloc_wvd(3); + + nb=VOLUME/(bs[0]*bs[1]*bs[2]*bs[3]); + nvh=Ns*(nb/2); + + random_ud(); + chs_ubnd(-1); + random_basis(Ns); + ifail=set_Awhat(mu); + error_root(ifail!=0,1,"main [check5.c]", + "Computation of the little Dirac operator failed"); + + if (my_rank==0) + printf("Maximal relative deviations found:\n\n"); + + dev=check_vd(Ns,nvh); + + if (my_rank==0) + printf("Orthonormality of vdflds: %.2e\n",dev); + + dev=check_Awvd(Ns,nvh); + + if (my_rank==0) + printf("Awhat*vdflds: %.2e\n",dev); + + dev=check_ltl_matrix(Ns,nvh); + + if (my_rank==0) + printf("Little-little matrix: %.2e\n\n",dev); + + dev=check_vflds(Ns,nvh); + + if (my_rank==0) + printf("Single-precision fields: %.2e\n",dev); + + dev=check_mds(Ns,nvh); + error_chk(); + + if (my_rank==0) + { + printf("Global deflation modes: %.2e\n\n",dev); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/time1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/time1.c new file mode 100644 index 0000000000000000000000000000000000000000..92901785e1f9a894790a9bf12fcdb11ca936386c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/time1.c @@ -0,0 +1,235 @@ + +/******************************************************************************* +* +* File time1.c +* +* Copyright (C) 2007, 2008, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Timing of Awhat(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "vflds.h" +#include "linalg.h" +#include "dirac.h" +#include "dfl.h" +#include "little.h" +#include "global.h" + + +static void random_basis(int Ns) +{ + int i; + spinor **ws; + + ws=reserve_ws(Ns); + + for (i=0;i]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + set_sw_parms(0.125); + set_dfl_parms(bs,Ns); + mu=0.0376; + + nb=VOLUME/(bs[0]*bs[1]*bs[2]*bs[3]); + nbb=2*(FACE0/(bs[1]*bs[2]*bs[3])+ + FACE1/(bs[0]*bs[2]*bs[3])+ + FACE2/(bs[0]*bs[1]*bs[3])+ + FACE3/(bs[0]*bs[1]*bs[2])); + nv=Ns*nb; + + start_ranlux(0,123456); + geometry(); + + alloc_ws(Ns); + alloc_wvd(2); + random_ud(); + chs_ubnd(-1); + random_basis(Ns); + + ifail=set_Awhat(mu); + error(ifail!=0,1,"main [time1.c]","Inversion of Aee or Aoo failed"); + + if (my_rank==0) + { + printf("Number of points = %d\n",VOLUME); + printf("Number of blocks = %d\n",nb); + printf("Number of points/block = %d\n",bs[0]*bs[1]*bs[2]*bs[3]); + printf("Vector field size = %.2f KB\n", + (double)(sizeof(complex)*nv)*1.0e-3); + printf("Awhat array size = %.2f MB\n\n", + (double)(sizeof(complex)*8*Ns*nv)*1.0e-6); + fflush(flog); + } + + nflds=(int)(1.0e6/(double)(sizeof(complex)*nv)); + if ((nflds%2)!=0) + nflds+=1; + if (nflds==0) + nflds=2; + + alloc_wv(nflds); + wv=reserve_wv(nflds); + + for (i=0;i1) + { + nt/=2; + if (nt==0) + nt=1; + wdt=0.0; + + while (wdt<5.0) + { + for (i=0;i +#include +#include +#include "utils.h" +#include "extras.h" + + +int main(void) +{ + double x,y; + + printf("\n"); + printf("Modified Bessel function I0(x) [program i0m()]\n"); + printf("----------------------------------------------\n\n"); + + printf("Print selected values:\n\n"); + + for (;;) + { + printf("Specify x: "); + + if (scanf("%lf",&x)==1) + { + y=i0m(x); + printf("x = %.4e, exp(-x)*I0(x) = %.15e\n\n",x,y); + } + else + { + printf("No value specified, program stopped\n\n"); + break; + } + } + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..d32a6fa775381b91c3ff77dffc20593c1deac355 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/INDEX @@ -0,0 +1,7 @@ + +Generic functions for MD force calculations + +check1 Check of det2xt and prod2xt + +check2 Check of prod2xv + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..bad6db290fc42f20fb14ec8e729c54130c938141 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/Makefile @@ -0,0 +1,126 @@ +################################################################################ +# +# Makefile to compile and link C programs +# +# Version valid for Linux machines +# +# "make" compiles and links the specified main programs and modules +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files created by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and required modules + +MAIN = check1 check2 + +RANDOM = ranlxs ranlxd gauss + +UTILS = utils + +LINALG = cmatrix cmatrix_dble + +FORCES = frcfcts + +SW_TERM = pauli pauli_dble + +SU3FCTS = random_su3 + +MODULES = $(RANDOM) $(UTILS) $(LINALG) $(FORCES) $(SW_TERM) $(SU3FCTS) + + +# search path for modules + +MDIR = ../../../modules + +VPATH = $(MDIR)/nompi/extras:$(MDIR)/nompi/utils:$(MDIR)/linalg:\ + $(MDIR)/random:$(MDIR)/frcfcts:$(MDIR)/sw_term:\ + $(MDIR)/su3fcts:$(MDIR)/forces + + +# additional include directories + +INCPATH = ../../../include/nompi ../../../include + + +# additional libraries to be included + +LIBS = m + +LIBPATH = + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 + + +############################## do not change ################################### + +SHELL=/bin/bash + +CC=$(GCC) + +PGMS= $(MAIN) $(MODULES) + +INCDIRS = $(addprefix -I,$(INCPATH)) + +OBJECTS = $(addsuffix .o,$(MODULES)) + +LDFLAGS = $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(CC) -MM -ansi $(INCDIRS) $< -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(INCDIRS) -o $@ + + +# rule to link object files + +$(MAIN): %: %.o $(OBJECTS) Makefile + $(CC) $< $(OBJECTS) $(CFLAGS) $(LDFLAGS) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables and old error log file + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..e12cabc3f3d89890f2ec394d2f380dd1890b9767 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/check1.c @@ -0,0 +1,323 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2005, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of det2xt and prod2xt +* +*******************************************************************************/ + +#include +#include +#include +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "su3fcts.h" +#include "sw_term.h" +#include "forces.h" + +typedef union +{ + spinor_dble s; + weyl_dble w[2]; + double r[24]; +} spin_t; + +typedef union +{ + su3_vector_dble v; + double r[6]; +} vec_t; + +static int pln[6][2]={{0,1},{0,2},{0,3},{2,3},{3,1},{1,2}}; +static const su3_vector_dble vd0={{0.0}}; +static const spinor_dble sd0={{{0.0}}}; + +static su3_dble Q ALIGNED16; +static spin_t s1,s2,s3,s4 ALIGNED16; +static pauli_dble m[2] ALIGNED16; + + +static su3_vector_dble mul_cplx(complex_dble z,su3_vector_dble s) +{ + su3_vector_dble r; + + r.c1.re=z.re*s.c1.re-z.im*s.c1.im; + r.c1.im=z.im*s.c1.re+z.re*s.c1.im; + r.c2.re=z.re*s.c2.re-z.im*s.c2.im; + r.c2.im=z.im*s.c2.re+z.re*s.c2.im; + r.c3.re=z.re*s.c3.re-z.im*s.c3.im; + r.c3.im=z.im*s.c3.re+z.re*s.c3.im; + + return r; +} + + +static spinor_dble mul_gamma(int mu,spinor_dble s) +{ + spinor_dble r; + complex_dble i,m_i,m_1; + + i.re=0.0; + i.im=1.0; + + m_i.re=0.0; + m_i.im=-1.0; + + m_1.re=-1.0; + m_1.im=0.0; + + if (mu==0) + { + r.c1=mul_cplx(m_1,s.c3); + r.c2=mul_cplx(m_1,s.c4); + r.c3=mul_cplx(m_1,s.c1); + r.c4=mul_cplx(m_1,s.c2); + } + else if (mu==1) + { + r.c1=mul_cplx(m_i,s.c4); + r.c2=mul_cplx(m_i,s.c3); + r.c3=mul_cplx(i,s.c2); + r.c4=mul_cplx(i,s.c1); + } + else if (mu==2) + { + r.c1=mul_cplx(m_1,s.c4); + r.c2=s.c3; + r.c3=s.c2; + r.c4=mul_cplx(m_1,s.c1); + } + else if (mu==3) + { + r.c1=mul_cplx(m_i,s.c3); + r.c2=mul_cplx(i,s.c4); + r.c3=mul_cplx(i,s.c1); + r.c4=mul_cplx(m_i,s.c2); + } + else + { + r.c1=s.c1; + r.c2=s.c2; + r.c3=mul_cplx(m_1,s.c3); + r.c4=mul_cplx(m_1,s.c4); + } + + return r; +} + + +static spinor_dble mul_sigma(int mu,int nu,spinor_dble s) +{ + complex_dble z; + spinor_dble r1,r2; + + r1=mul_gamma(nu,s); + r1=mul_gamma(mu,r1); + + r2=mul_gamma(mu,s); + r2=mul_gamma(nu,r2); + + _vector_sub_assign(r1.c1,r2.c1); + _vector_sub_assign(r1.c2,r2.c2); + _vector_sub_assign(r1.c3,r2.c3); + _vector_sub_assign(r1.c4,r2.c4); + + z.re=0.0; + z.im=0.5; + _vector_mulc(r2.c1,z,r1.c1); + _vector_mulc(r2.c2,z,r1.c2); + _vector_mulc(r2.c3,z,r1.c3); + _vector_mulc(r2.c4,z,r1.c4); + + return r2; +} + + +static spinor_dble mul_Fhat(su3_dble Q,spinor_dble s) +{ + su3_dble F; + spinor_dble r; + + F.c11.re=0.0; + F.c11.im=0.25*Q.c11.im; + F.c22.re=0.0; + F.c22.im=0.25*Q.c22.im; + F.c33.re=0.0; + F.c33.im=0.25*Q.c33.im; + + F.c12.re=0.125*(Q.c12.re-Q.c21.re); + F.c12.im=0.125*(Q.c12.im+Q.c21.im); + F.c21.re=-F.c12.re; + F.c21.im=F.c12.im; + + F.c13.re=0.125*(Q.c13.re-Q.c31.re); + F.c13.im=0.125*(Q.c13.im+Q.c31.im); + F.c31.re=-F.c13.re; + F.c31.im=F.c13.im; + + F.c23.re=0.125*(Q.c23.re-Q.c32.re); + F.c23.im=0.125*(Q.c23.im+Q.c32.im); + F.c32.re=-F.c23.re; + F.c32.im=F.c23.im; + + _su3_multiply(r.c1,F,s.c1); + _su3_multiply(r.c2,F,s.c2); + _su3_multiply(r.c3,F,s.c3); + _su3_multiply(r.c4,F,s.c4); + + return r; +} + + +static su3_vector_dble mul_X(u3_alg_dble X,su3_vector_dble s) +{ + su3_dble M; + su3_vector_dble r; + + M.c11.re=0.0; + M.c11.im=X.c1; + M.c22.re=0.0; + M.c22.im=X.c2; + M.c33.re=0.0; + M.c33.im=X.c3; + + M.c12.re=X.c4; + M.c12.im=X.c5; + M.c21.re=-X.c4; + M.c21.im=X.c5; + + M.c13.re=X.c6; + M.c13.im=X.c7; + M.c31.re=-X.c6; + M.c31.im=X.c7; + + M.c23.re=X.c8; + M.c23.im=X.c9; + M.c32.re=-X.c8; + M.c32.im=X.c9; + + _su3_multiply(r,M,s); + + return r; +} + + +int main(void) +{ + int n,mu,nu,i; + complex_dble z; + vec_t v1,v2,v3; + u3_alg_dble X[6]; + + printf("\n"); + printf("Check of det2xt and prod2xt\n"); + printf("---------------------------\n\n"); + + rlxd_init(1,23456); + + ranlxd(v1.r,6); + ranlxd(v2.r,6); + ranlxd(v3.r,6); + + ranlxd(s1.r,24); + ranlxd(s2.r,24); + ranlxd(s3.r,24); + ranlxd(s4.r,24); + + ranlxd(m[0].u,36); + ranlxd(m[1].u,36); + + det2xt(m,X); + + printf("det2xt:\n"); + + for (n=0;n<6;n++) + { + mu=pln[n][0]; + nu=pln[n][1]; + + random_su3_dble(&Q); + z.im=0.0; + + for (i=0;i<12;i++) + { + s1.s=sd0; + s1.r[2*i]=1.0; + + mul_pauli_dble(0.0,m,s1.w,s2.w); + mul_pauli_dble(0.0,m+1,s1.w+1,s2.w+1); + s1.s=mul_sigma(mu,nu,s2.s); + s2.s=mul_Fhat(Q,s1.s); + + z.im-=s2.r[2*i+1]; + } + + z.re=0.0; + + for (i=0;i<3;i++) + { + v1.v=vd0; + v1.r[2*i]=1.0; + + v2.v=mul_X(X[n],v1.v); + _su3_multiply(v3.v,Q,v2.v); + + z.re+=v3.r[2*i]; + } + + printf("mu,nu = %d,%d: %.2e\n", + mu,nu,fabs(2.0*z.re-8.0*z.im)); + } + + ranlxd(s1.r,24); + ranlxd(s2.r,24); + + prod2xt(&s1.s,&s2.s,X); + + printf("\n"); + printf("prod2xt:\n"); + + for (n=0;n<6;n++) + { + mu=pln[n][0]; + nu=pln[n][1]; + + random_su3_dble(&Q); + z.im=0.0; + + s3.s=mul_sigma(mu,nu,s2.s); + s4.s=mul_gamma(5,s3.s); + s3.s=mul_Fhat(Q,s4.s); + + z.im =_vector_prod_im(s1.s.c1,s3.s.c1); + z.im+=_vector_prod_im(s1.s.c2,s3.s.c2); + z.im+=_vector_prod_im(s1.s.c3,s3.s.c3); + z.im+=_vector_prod_im(s1.s.c4,s3.s.c4); + + z.re=0.0; + + for (i=0;i<3;i++) + { + v1.v=vd0; + v1.r[2*i]=1.0; + + v2.v=mul_X(X[n],v1.v); + _su3_multiply(v3.v,Q,v2.v); + + z.re+=v3.r[2*i]; + } + + printf("mu,nu = %d,%d: %.2e\n", + mu,nu,fabs(2.0*z.re+16.0*z.im)); + } + + printf("\n"); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..4bedffaeca45313fad99022e8d89d19dc3e30a37 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/check2.c @@ -0,0 +1,206 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2005, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of prod2xv +* +*******************************************************************************/ + +#include +#include +#include +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "su3fcts.h" +#include "sw_term.h" +#include "forces.h" + +typedef union +{ + su3_dble u; + complex_dble c[9]; +} umat_t; + +static const su3_dble ud0={{0.0}}; +static su3_dble u,v ALIGNED16; +static spinor_dble rx,ry,sx,sy,sw ALIGNED16; + +#define _re(z,w) ((z).re*(w).re+(z).im*(w).im) +#define _im(z,w) ((z).im*(w).re-(z).re*(w).im) + + +static su3_vector_dble mul_cplx(complex_dble z,su3_vector_dble s) +{ + su3_vector_dble r; + + r.c1.re=z.re*s.c1.re-z.im*s.c1.im; + r.c1.im=z.im*s.c1.re+z.re*s.c1.im; + r.c2.re=z.re*s.c2.re-z.im*s.c2.im; + r.c2.im=z.im*s.c2.re+z.re*s.c2.im; + r.c3.re=z.re*s.c3.re-z.im*s.c3.im; + r.c3.im=z.im*s.c3.re+z.re*s.c3.im; + + return r; +} + + +static spinor_dble mul_gamma(int mu,spinor_dble s) +{ + spinor_dble r; + complex_dble i,m_i,m_1; + + i.re=0.0; + i.im=1.0; + + m_i.re=0.0; + m_i.im=-1.0; + + m_1.re=-1.0; + m_1.im=0.0; + + if (mu==0) + { + r.c1=mul_cplx(m_1,s.c3); + r.c2=mul_cplx(m_1,s.c4); + r.c3=mul_cplx(m_1,s.c1); + r.c4=mul_cplx(m_1,s.c2); + } + else if (mu==1) + { + r.c1=mul_cplx(m_i,s.c4); + r.c2=mul_cplx(m_i,s.c3); + r.c3=mul_cplx(i,s.c2); + r.c4=mul_cplx(i,s.c1); + } + else if (mu==2) + { + r.c1=mul_cplx(m_1,s.c4); + r.c2=s.c3; + r.c3=s.c2; + r.c4=mul_cplx(m_1,s.c1); + } + else if (mu==3) + { + r.c1=mul_cplx(m_i,s.c3); + r.c2=mul_cplx(i,s.c4); + r.c3=mul_cplx(i,s.c1); + r.c4=mul_cplx(m_i,s.c2); + } + else + { + r.c1=s.c1; + r.c2=s.c2; + r.c3=mul_cplx(m_1,s.c3); + r.c4=mul_cplx(m_1,s.c4); + } + + return r; +} + + +static void add_tensor(su3_vector_dble *r,su3_vector_dble *s,su3_dble *p) +{ + (*p).c11.re+=_re((*r).c1,(*s).c1); + (*p).c11.im+=_im((*r).c1,(*s).c1); + (*p).c12.re+=_re((*r).c1,(*s).c2); + (*p).c12.im+=_im((*r).c1,(*s).c2); + (*p).c13.re+=_re((*r).c1,(*s).c3); + (*p).c13.im+=_im((*r).c1,(*s).c3); + + (*p).c21.re+=_re((*r).c2,(*s).c1); + (*p).c21.im+=_im((*r).c2,(*s).c1); + (*p).c22.re+=_re((*r).c2,(*s).c2); + (*p).c22.im+=_im((*r).c2,(*s).c2); + (*p).c23.re+=_re((*r).c2,(*s).c3); + (*p).c23.im+=_im((*r).c2,(*s).c3); + + (*p).c31.re+=_re((*r).c3,(*s).c1); + (*p).c31.im+=_im((*r).c3,(*s).c1); + (*p).c32.re+=_re((*r).c3,(*s).c2); + (*p).c32.im+=_im((*r).c3,(*s).c2); + (*p).c33.re+=_re((*r).c3,(*s).c3); + (*p).c33.im+=_im((*r).c3,(*s).c3); +} + + +static double max_dev(su3_dble *u,su3_dble *v) +{ + int i; + double nrm,dev; + umat_t uu,uv; + + uu.u=(*u); + uv.u=(*v); + + nrm=0.0; + dev=0.0; + + for (i=0;i<9;i++) + { + nrm+=uu.c[i].re*uu.c[i].re+uu.c[i].im*uu.c[i].im; + + dev+=(uu.c[i].re-uv.c[i].re)*(uu.c[i].re-uv.c[i].re)+ + (uu.c[i].im-uv.c[i].im)*(uu.c[i].im-uv.c[i].im); + } + + return sqrt(dev/nrm); +} + + +int main(void) +{ + int mu; + + printf("\n"); + printf("Check of prod2xv\n"); + printf("-----------------\n\n"); + + rlxd_init(1,567); + + gauss_dble((double*)(&rx),24); + gauss_dble((double*)(&ry),24); + gauss_dble((double*)(&sx),24); + gauss_dble((double*)(&sy),24); + + for (mu=0;mu<4;mu++) + { + prod2xv[mu](&rx,&ry,&sx,&sy,&u); + v=ud0; + + sw=mul_gamma(mu,ry); + _vector_sub(sw.c1,ry.c1,sw.c1); + _vector_sub(sw.c2,ry.c2,sw.c2); + _vector_sub(sw.c3,ry.c3,sw.c3); + _vector_sub(sw.c4,ry.c4,sw.c4); + sw=mul_gamma(5,sw); + + add_tensor(&sw.c1,&sx.c1,&v); + add_tensor(&sw.c2,&sx.c2,&v); + add_tensor(&sw.c3,&sx.c3,&v); + add_tensor(&sw.c4,&sx.c4,&v); + + sw=mul_gamma(mu,sy); + _vector_sub(sw.c1,sy.c1,sw.c1); + _vector_sub(sw.c2,sy.c2,sw.c2); + _vector_sub(sw.c3,sy.c3,sw.c3); + _vector_sub(sw.c4,sy.c4,sw.c4); + sw=mul_gamma(5,sw); + + add_tensor(&sw.c1,&rx.c1,&v); + add_tensor(&sw.c2,&rx.c2,&v); + add_tensor(&sw.c3,&rx.c3,&v); + add_tensor(&sw.c4,&rx.c4,&v); + + printf("mu = %d: %.2e\n",mu,max_dev(&u,&v)); + } + + printf("\n"); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..b43d50153001c466be2f329a622d5f5a0394f1ec --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/INDEX @@ -0,0 +1,10 @@ + +Complex matrix functions + +check1 Check of cmat_vec, cmat_add, ... + +check2 Check of cmat_vec_dble, cmat_add_dble, ... + +time1 Timing of cmat_vec and cmat_mul + +time2 Timing of cmat_vec_dble, cmat_mul_dble and cmat_inv_dble diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..9337e77ae9ea19bf857cd9dc44bdf3376f61b9e5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/Makefile @@ -0,0 +1,119 @@ +################################################################################ +# +# Makefile to compile and link C programs +# +# Version valid for Linux machines +# +# "make" compiles and links the specified main programs and modules +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files created by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and required modules + +MAIN = check1 check2 time1 time2 + +RANDOM = ranlxs ranlxd gauss + +UTILS = utils + +LINALG = cmatrix cmatrix_dble + +MODULES = $(RANDOM) $(UTILS) $(LINALG) + + +# search path for modules + +MDIR = ../../../modules + +VPATH = $(MDIR)/nompi/utils:$(MDIR)/random:$(MDIR)/linalg + + +# additional include directories + +INCPATH = ../../../include/nompi ../../../include + + +# additional libraries to be included + +LIBS = m + +LIBPATH = + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing -fno-inline \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + +# -Dx64 + +############################## do not change ################################### + +SHELL=/bin/bash + +CC=$(GCC) + +PGMS= $(MAIN) $(MODULES) + +INCDIRS = $(addprefix -I,$(INCPATH)) + +OBJECTS = $(addsuffix .o,$(MODULES)) + +LDFLAGS = $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(CC) -MM -ansi $(INCDIRS) $< -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(INCDIRS) -o $@ + + +# rule to link object files + +$(MAIN): %: %.o $(OBJECTS) Makefile + $(CC) $< $(OBJECTS) $(CFLAGS) $(LDFLAGS) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables and old error log file + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..fedca05691fcef6e4f74ef6b9930808022fb8774 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/check1.c @@ -0,0 +1,387 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2007, 2009, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of cmat_vec_dble, cmat_add_dble, ... +* +*******************************************************************************/ + +#include +#include +#include +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "linalg.h" + +#define NMAX 32 + +#define cadd(u,v,w) \ + (u).re=(v).re+(w).re;\ + (u).im=(v).im+(w).im + +#define csub(u,v,w) \ + (u).re=(v).re-(w).re;\ + (u).im=(v).im-(w).im + +#define cmul(u,v,w) \ + (u).re=(v).re*(w).re-(v).im*(w).im;\ + (u).im=(v).re*(w).im+(v).im*(w).re + +#define cmul_assign(u,v,w) \ + (u).re+=((v).re*(w).re-(v).im*(w).im);\ + (u).im+=((v).re*(w).im+(v).im*(w).re) + + +static void mvec(int n,complex *a,complex *v,complex *w) +{ + int i,j; + complex z; + + for (i=0;idmax) + dmax=d; + } + + return dmax; +} + + +static float mdev(int n,complex *a,complex *b) +{ + int i,j; + float d,dmax; + + dmax=0.0f; + + for (i=0;idmax) + dmax=d; + } + } + + return dmax; +} + + +static void rvec(int n,complex *v) +{ + gauss((float*)(v),2*n); +} + + +static void rmat(int n,complex *a) +{ + int i,j; + float r; + + r=0.1f/(float)(n*n); + + gauss((float*)(a),2*n*n); + + for (i=0;id1) + d1=d; + + error((mdev(n,a1,a2)!=0.0f)||(vdev(n,v1,v2)!=0.0f),1,"main [check1.c]", + "cmat_vec: input values have changed"); + + rvec(n,v1); + rvec(n,w1); + rmat(n,a1); + vec2vec(n,v1,v2); + vec2vec(n,w1,w2); + mat2mat(n,a1,a2); + + cmat_vec_assign(n,a1,v1,w1); + mvec_assign(n,a2,v2,w2); + + d=vdev(n,w1,w2); + if (d>d1) + d1=d; + + error((mdev(n,a1,a2)!=0.0f)||(vdev(n,v1,v2)!=0.0f),1,"main [check1.c]", + "cmat_vec_assign: input values have changed"); + + rmat(n,a1); + rmat(n,b1); + rmat(n,c1); + mat2mat(n,a1,a2); + mat2mat(n,b1,b2); + rmat(n,c2); + + cmat_add(n,a1,b1,c1); + madd(n,a2,b2,c2); + + d=mdev(n,c1,c2); + if (d>d2) + d2=d; + + error((mdev(n,a1,a2)!=0.0f)||(mdev(n,b1,b2)!=0.0f),1,"main [check1.c]", + "cmat_add: input values have changed"); + + rmat(n,a1); + rmat(n,b1); + rmat(n,c1); + mat2mat(n,a1,a2); + mat2mat(n,b1,b2); + rmat(n,c2); + + cmat_sub(n,a1,b1,c1); + msub(n,a2,b2,c2); + + d=mdev(n,c1,c2); + if (d>d3) + d3=d; + + error((mdev(n,a1,a2)!=0.0f)||(mdev(n,b1,b2)!=0.0f),1,"main [check1.c]", + "cmat_sub: input values have changed"); + + rmat(n,a1); + rmat(n,b1); + rmat(n,c1); + mat2mat(n,a1,a2); + mat2mat(n,b1,b2); + rmat(n,c2); + + cmat_mul(n,a1,b1,c1); + mmul(n,a2,b2,c2); + + d=mdev(n,c1,c2); + if (d>d4) + d4=d; + + error((mdev(n,a1,a2)!=0.0f)||(mdev(n,b1,b2)!=0.0f),1,"main [check1.c]", + "cmat_mul: input values have changed"); + + rmat(n,a1); + rmat(n,b1); + mat2mat(n,a1,a2); + rmat(n,b2); + + cmat_dag(n,a1,b1); + mdag(n,a2,b2); + + d=mdev(n,b1,b2); + if (d>d5) + d5=d; + + error(mdev(n,a1,a2)!=0.0f,1,"main [check1.c]", + "cmat_dag: input values have changed"); + } + + printf("Consider matrices of size up to %dx%d\n\n",NMAX,NMAX); + + printf("The maximal observed deviations are:\n\n"); + printf("cmat_vec: %.1e\n",d1); + printf("cmat_add: %.1e\n",d2); + printf("cmat_sub: %.1e\n",d3); + printf("cmat_mul: %.1e\n",d4); + printf("cmat_dag: %.1e\n\n",d5); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..d8e249d9573a323d7d1284127cfa585d2b30d654 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/check2.c @@ -0,0 +1,436 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2007, 2009, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of cmat_vec_dble, cmat_add_dble, ... +* +*******************************************************************************/ + +#include +#include +#include +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "linalg.h" + +#define NMAX 32 + +#define cadd(u,v,w) \ + (u).re=(v).re+(w).re;\ + (u).im=(v).im+(w).im + +#define csub(u,v,w) \ + (u).re=(v).re-(w).re;\ + (u).im=(v).im-(w).im + +#define cmul(u,v,w) \ + (u).re=(v).re*(w).re-(v).im*(w).im;\ + (u).im=(v).re*(w).im+(v).im*(w).re + +#define cmul_assign(u,v,w) \ + (u).re+=((v).re*(w).re-(v).im*(w).im);\ + (u).im+=((v).re*(w).im+(v).im*(w).re) + + +static void mvec(int n,complex_dble *a,complex_dble *v,complex_dble *w) +{ + int i,j; + complex_dble z; + + for (i=0;idmax) + dmax=d; + } + + return dmax; +} + + +static double mdev(int n,complex_dble *a,complex_dble *b) +{ + int i,j; + double d,dmax; + + dmax=0.0; + + for (i=0;idmax) + dmax=d; + } + } + + return dmax; +} + + +static void rvec(int n,complex_dble *v) +{ + gauss_dble((double*)(v),2*n); +} + + +static void rmat(int n,complex_dble *a) +{ + int i,j; + double r; + + r=0.1/(double)(n*n); + + gauss_dble((double*)(a),2*n*n); + + for (i=0;id1) + d1=d; + + error((mdev(n,a1,a2)!=0.0)||(vdev(n,v1,v2)!=0.0),1,"main [check2.c]", + "cmat_vec_dble: input values have changed"); + + rvec(n,v1); + rvec(n,w1); + rmat(n,a1); + vec2vec(n,v1,v2); + vec2vec(n,w1,w2); + mat2mat(n,a1,a2); + + cmat_vec_assign_dble(n,a1,v1,w1); + mvec_assign(n,a2,v2,w2); + + d=vdev(n,w1,w2); + if (d>d1) + d1=d; + + error((mdev(n,a1,a2)!=0.0)||(vdev(n,v1,v2)!=0.0),1,"main [check2.c]", + "cmat_vec_assign_dble: input values have changed"); + + rmat(n,a1); + rmat(n,b1); + rmat(n,c1); + mat2mat(n,a1,a2); + mat2mat(n,b1,b2); + rmat(n,c2); + + cmat_add_dble(n,a1,b1,c1); + madd(n,a2,b2,c2); + + d=mdev(n,c1,c2); + if (d>d2) + d2=d; + + error((mdev(n,a1,a2)!=0.0)||(mdev(n,b1,b2)!=0.0),1,"main [check2.c]", + "cmat_add_dble: input values have changed"); + + rmat(n,a1); + rmat(n,b1); + rmat(n,c1); + mat2mat(n,a1,a2); + mat2mat(n,b1,b2); + rmat(n,c2); + + cmat_sub_dble(n,a1,b1,c1); + msub(n,a2,b2,c2); + + d=mdev(n,c1,c2); + if (d>d3) + d3=d; + + error((mdev(n,a1,a2)!=0.0)||(mdev(n,b1,b2)!=0.0),1,"main [check2.c]", + "cmat_sub_dble: input values have changed"); + + rmat(n,a1); + rmat(n,b1); + rmat(n,c1); + mat2mat(n,a1,a2); + mat2mat(n,b1,b2); + rmat(n,c2); + + cmat_mul_dble(n,a1,b1,c1); + mmul(n,a2,b2,c2); + + d=mdev(n,c1,c2); + if (d>d4) + d4=d; + + error((mdev(n,a1,a2)!=0.0)||(mdev(n,b1,b2)!=0.0),1,"main [check2.c]", + "cmat_mul_dble: input values have changed"); + + rmat(n,a1); + rmat(n,b1); + mat2mat(n,a1,a2); + rmat(n,b2); + + cmat_dag_dble(n,a1,b1); + mdag(n,a2,b2); + + d=mdev(n,b1,b2); + if (d>d5) + d5=d; + + error(mdev(n,a1,a2)!=0.0,1,"main [check2.c]", + "cmat_dag_dble: input values have changed"); + + rmat(n,a1); + rmat(n,b1); + mat2mat(n,a1,a2); + rmat(n,b2); + rmat(n,c2); + + ie=cmat_inv_dble(n,a1,b1,&k1); + mmul(n,a2,b1,b2); + mmul(n,a2,b2,c2); + + d=mdev(n,a2,c2); + if (d>d6) + d6=d; + + if (k1>kmax) + kmax=k1; + + k2=fnorm(n,a1)*fnorm(n,b1); + d=fabs(k2/k1-1.0); + if (d>d7) + d7=d; + + error(ie!=0,1,"main [check2.c]", + "cmat_inv_dble: singular matrix encountered"); + + error(mdev(n,a1,a2)!=0.0,1,"main [check2.c]", + "cmat_inv_dble: input values have changed"); + } + + printf("Consider matrices of size up to %dx%d\n\n",NMAX,NMAX); + + printf("The maximal observed deviations are:\n\n"); + printf("cmat_vec_dble: %.1e\n",d1); + printf("cmat_add_dble: %.1e\n",d2); + printf("cmat_sub_dble: %.1e\n",d3); + printf("cmat_mul_dble: %.1e\n",d4); + printf("cmat_dag_dble: %.1e\n",d5); + printf("cmat_inv_dble: %.1e, condition number: max=%.1e, dev=%.1e\n\n", + d6,kmax,d7); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/time1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/time1.c new file mode 100644 index 0000000000000000000000000000000000000000..5d4880a9cfd04e965d2d014f5b9dad500d6c741d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/time1.c @@ -0,0 +1,100 @@ + +/******************************************************************************* +* +* File time1.c +* +* Copyright (C) 2007, 2009, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Timing of cmat_vec and cmat_mul +* +*******************************************************************************/ + +#include +#include +#include +#include +#include "random.h" +#include "su3.h" +#include "utils.h" +#include "linalg.h" + + +int main(void) +{ + int ir,nm,n,count; + double t1,t2,dt; + complex *a,*b,*c,*v,*w; + + printf("\n"); + printf("Timing of cmat_vec and cmat_mul\n"); + printf("-------------------------------\n\n"); + +#if (defined AVX) + printf("Using AVX instructions\n\n"); +#elif (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n\n"); +#endif + + printf("Measurement made with all data in cache\n\n"); + + printf("Matrix size: "); + ir=scanf(" %d",&nm); + + error((ir!=1)||(nm<1),1,"main [time1.c]", + "Read error or improper matrix size"); + + a=amalloc((3*nm*nm+2*nm)*sizeof(*a),4); + error(a==NULL,1,"main [time1.c]","Unable to allocate auxiliary arrays"); + + rlxs_init(0,23456); + ranlxs((float*)(a),6*nm*nm+4*nm); + + b=a+nm*nm; + c=b+nm*nm; + v=c+nm*nm; + w=v+nm; + + n=(int)(1.0e7)/(nm*nm); + dt=0.0; + + while (dt<2.0) + { + t1=(double)clock(); + for (count=0;count +#include +#include +#include +#include "random.h" +#include "su3.h" +#include "utils.h" +#include "linalg.h" + + +int main(void) +{ + int ie,ir,nm,n,count; + double k,t1,t2,dt; + complex_dble *a,*b,*c,*v,*w; + + printf("\n"); + printf("Timing of cmat_vec_dble, cmat_mul_dble and cmat_inv_dble\n"); + printf("--------------------------------------------------------\n\n"); + +#if (defined AVX) + printf("Using AVX instructions\n\n"); +#elif (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n\n"); +#endif + + printf("Measurement made with all data in cache\n\n"); + + printf("Matrix size: "); + ir=scanf(" %d",&nm); + + error((ir!=1)||(nm<1),1,"main [time2.c]", + "Read error or improper matrix size"); + + a=amalloc((3*nm*nm+2*nm)*sizeof(*a),6); + error(a==NULL,1,"main [time2.c]","Unable to allocate auxiliary arrays"); + + rlxd_init(1,23456); + ranlxd((double*)(a),6*nm*nm+4*nm); + + b=a+nm*nm; + c=b+nm*nm; + v=c+nm*nm; + w=v+nm; + + n=(int)(1.0e7)/(nm*nm); + dt=0.0; + + while (dt<2.0) + { + t1=(double)clock(); + for (count=0;count.dat produced by the + simulation programs qcd1 and ym1. + +read2 Reads the data files produced by the measurement + program ms1. + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/main/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/main/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..2ee788070bcfa33bdc7e677b20a87bf356d2cfa5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/main/Makefile @@ -0,0 +1,115 @@ +################################################################################ +# +# Makefile to compile and link C programs +# +# Version valid for Linux machines +# +# "make" compiles and links the specified main programs and modules +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files created by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and required modules + +MAIN = read1 read2 + +EXTRAS = stat + +UTILS = utils mutils endian + +MODULES = $(EXTRAS) $(UTILS) + + +# search path for modules + +MDIR = ../../../modules + +VPATH = .:$(MDIR)/nompi/extras:$(MDIR)/nompi/utils + + +# additional include directories + +INCPATH = ../../../include/nompi + + +# additional libraries to be included + +LIBS = m + +LIBPATH = + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fno-strict-aliasing \ + -Wall -Wstrict-prototypes -Werror -O + + +############################## do not change ################################### + +SHELL=/bin/bash + +CC=$(GCC) + +PGMS= $(MAIN) $(MODULES) + +INCDIRS = $(addprefix -I,$(INCPATH)) + +OBJECTS = $(addsuffix .o,$(MODULES)) + +LDFLAGS = $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(CC) -MM -ansi $(INCDIRS) $< -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(INCDIRS) -o $@ + + +# rule to link object files + +$(MAIN): %: %.o $(OBJECTS) Makefile + $(CC) $< $(OBJECTS) $(CFLAGS) $(LDFLAGS) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables and old error log file + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/main/read1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/main/read1.c new file mode 100644 index 0000000000000000000000000000000000000000..7f23a163c52ec4de6c5022ed06051b16dc253cb5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/main/read1.c @@ -0,0 +1,328 @@ + +/******************************************************************************* +* +* File read1.c +* +* Copyright (C) 2010-2014 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Reads and evaluates data from the *.dat files created by the programs qcd1 +* and ym1. The file to be read has to be specified on the command line. +* +* This program writes the history of the MD energy deficit dH, the acceptance +* flag iac and the average plaquette to the file .run1.dat in the +* plots directory. In addition, some information about the distribution of dH +* and the integrated autocorrelation time of the plaquette are printed to +* stdout. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "utils.h" +#include "extras.h" + +typedef struct +{ + int nt,iac; + double dH,avpl; +} dat_t; + +static int nms,nfirst,nlast,neff; +static dat_t *adat; + + +static int read_dat(int n,dat_t *ndat,FILE *fin) +{ + int i,ir,ic,endian; + stdint_t istd[2]; + double dstd[2]; + + endian=endianness(); + ic=0; + + for (i=0;iamx); + + return (double)(ic)/(double)(n); +} + + +static double f(int nx,double x[]) +{ + return x[0]; +} + + +static void print_plot(char *fin) +{ + int n,ims; + char base[NAME_SIZE],plt_file[NAME_SIZE],*p; + dat_t *ndat; + FILE *fout; + + p=strstr(fin,".dat"); + error(p==NULL,1,"print_plot [read1.c]","Unexpected data file name"); + n=p-fin; + + p=strrchr(fin,'/'); + if (p==NULL) + p=fin; + else + p+=1; + n-=(p-fin); + + error(n>=NAME_SIZE,1,"print_plot [read1.c]","File name is too long"); + strncpy(base,p,n); + base[n]='\0'; + + error(name_size("plots/%s.run1.dat",base)>=NAME_SIZE,1, + "print_plot [read1.c]","File name is too long"); + sprintf(plt_file,"plots/%s.run1.dat",base); + fout=fopen(plt_file,"w"); + error(fout==NULL,1,"print_plot [read1.c]", + "Unable to open output file"); + + fprintf(fout,"#\n"); + fprintf(fout,"# Data written by the program ym1 or qcd1\n"); + fprintf(fout,"# ---------------------------------------\n"); + fprintf(fout,"#\n"); + fprintf(fout,"# Number of measurements = %d\n",nms); + fprintf(fout,"#\n"); + fprintf(fout,"# nt: trajectory number\n"); + fprintf(fout,"# dH: MD energy deficit\n"); + fprintf(fout,"# iac: acceptance flag\n"); + fprintf(fout,"#\n"); + fprintf(fout,"# nt dH iac \n"); + fprintf(fout,"#\n"); + + ndat=adat; + + for (ims=0;ims"); + + printf("\n"); + printf("HMC simulation of QCD\n"); + printf("---------------------\n\n"); + + read_file(argv[1]); + select_range(); + + a=malloc(neff*sizeof(double)); + error(a==NULL,1,"main [read1.c]", + "Unable to allocate data array"); + + for (n=0;n = %.3f (%.3f)\n", + average(neff,a),sigma0(neff,a)); + + for (n=0;n0.0) + a[n]=exp(-adat[nfirst+n].dH); + else + a[n]=1.0; + } + + printf(" = %.3f (%.3f)\n", + average(neff,a),sigma0(neff,a)); + + for (n=0;n = %.3f (%.3f)\n\n", + average(neff,a),sigma0(neff,a)); + + for (n=0;n100) + printf("using the\nnumerically determined " + "autocorrelation function.\n\n"); + else + printf("by binning the\ndata and by calculating " + "the jackknife errors of the binned series.\n\n"); + + printf("The autocorrelation times are given in numbers of measurements\n" + "separated by %d trajectories.\n\n", + adat[nfirst+1].nt-adat[nfirst].nt); + + if (neff>=100) + abar=print_auto(neff,a); + else + abar=print_jack(1,neff,&a,f); + + printf(" = %1.6f\n\n",abar); + + print_plot(argv[1]); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/main/read2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/main/read2.c new file mode 100644 index 0000000000000000000000000000000000000000..badca6542645cf5b1e807b8e6a52371923076390 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/main/read2.c @@ -0,0 +1,606 @@ +/******************************************************************************* +* +* File read2.c +* +* Copyright (C) 2012-2014 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Reads and evaluates data from the data files created by the program ms1. +* The file to be read has to be specified on the command line. +* +* This program writes the history of the measured normalized reweighting +* factors to the file .run2.dat in the plots directory. The +* associated integrated autocorrelation times are estimated and printed +* to stdout. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "utils.h" +#include "extras.h" + +static struct +{ + int nrw; + int *nfct,*nsrc; +} file_head; + +static struct +{ + int nc; + double ***sqn,***lnr; +} data; + +static int endian; +static int first,last,step,nms; +static double ***avrw,***lnrw,*avtot,*lntot; + + +static void read_file_head(FILE *fdat) +{ + int nrw,*nfct,*nsrc; + int ir,ie,irw; + stdint_t istd[1]; + + ir=fread(istd,sizeof(stdint_t),1,fdat); + error(ir!=1,1,"read_file_head [read2.c]", + "Incorrect read count"); + + if (endian==BIG_ENDIAN) + bswap_int(1,istd); + + nrw=(int)(istd[0]); + error(nrw<1,1,"read_file_head [read2.c]", + "nrw is out of range"); + + nfct=malloc(2*nrw*sizeof(*nfct)); + error(nfct==NULL,1,"read_file_head [read2.c]", + "Unable to allocate data arrays"); + nsrc=nfct+nrw; + ie=0; + + for (irw=0;irwlst) + { + last=last-((last-lst)/step)*step; + if (last>lst) + last-=step; + } + + error((last=first)&&(nc<=last)&&(((nc-first)%step)==0)) + { + data2avrw(ims); + ims+=1; + } + } + + fclose(fdat); + error((ims!=nms)||(data.nc!=last),1,"read_file [read2.c]", + "Incorrect read count"); + + normalize_avrw(); +} + + +static double f(int nx,double x[]) +{ + return x[0]; +} + + +static void print_plot(char *fin) +{ + int n,nrw,irw,ims; + char base[NAME_SIZE],plt_file[NAME_SIZE],*p; + FILE *fout; + + p=strstr(fin,".ms1.dat"); + error(p==NULL,1,"print_plot [read2.c]","Unexpected data file name"); + n=p-fin; + + p=strrchr(fin,'/'); + if (p==NULL) + p=fin; + else + p+=1; + n-=(p-fin); + + error(n>=NAME_SIZE,1,"print_plot [read2.c]","File name is too long"); + strncpy(base,p,n); + base[n]='\0'; + + error(name_size("plots/%s.run2.dat",base)>=NAME_SIZE,1, + "print_plot [read2.c]","File name is too long"); + sprintf(plt_file,"plots/%s.run2.dat",base); + fout=fopen(plt_file,"w"); + error(fout==NULL,1,"print_plot [read2.c]", + "Unable to open output file"); + + nrw=file_head.nrw; + + fprintf(fout,"#\n"); + fprintf(fout,"# Data written by the program ms1\n"); + fprintf(fout,"# -------------------------------\n"); + fprintf(fout,"#\n"); + fprintf(fout,"# Number of measurements = %d\n",nms); + fprintf(fout,"#\n"); + fprintf(fout,"# nc: Configuration number\n"); + fprintf(fout,"# W: Normalized reweighting factors\n"); + fprintf(fout,"#\n"); + fprintf(fout,"# nc"); + + for (irw=0;irw"); + + printf("\n"); + printf("History of reweighting factors\n"); + printf("------------------------------\n\n"); + + read_file(argv[1]); + nrw=file_head.nrw; + nfct=file_head.nfct; + nsrc=file_head.nsrc; + + printf("The total number of measurements is %d.\n",nms); + printf("Integrated autocorrelation times and associated errors are "); + printf("estimated\n"); + + if (nms>100) + printf("using the numerically determined autocorrelation function.\n"); + else + printf("by binning and calculating jackknife errors.\n"); + + printf("Autocorrelation times are given in numbers of measurements.\n\n"); + + for (irw=0;irw1) + printf("Factorized into %d factors.\n",nfct[irw]); + if (nsrc[irw]>1) + printf("Using %d random source fields.\n\n",nsrc[irw]); + else + printf("Using 1 random source field.\n\n"); + + if (nms>=100) + print_auto(nms,avrw[irw][0]); + else + print_jack(1,nms,avrw[irw],f); + + printf("\n"); + } + + if (nrw!=1) + { + printf("Product of all reweighting factors:\n\n"); + + if (nms>=100) + print_auto(nms,avtot); + else + print_jack(1,nms,&avtot,f); + + printf("\n"); + } + + print_plot(argv[1]); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..39e3e1f63d02f94850ae3f4d22724fbe7032d93d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/INDEX @@ -0,0 +1,27 @@ + +Random number generation and related programs + + +check1 Correctness of ranlxs and ranlxd + +check2 Save state of ranlxs to a file and reset the generator + from the data on the file + +check3 Save state of ranlxd to a file and reset the generator + from the data on the file + +check4 Kolmogorov-Smirnov test of the random distribution produced + by gauss and gauss_dble + +check5 Statistical test of random_su3 + +check6 Statistical test of random_su3_dble + +check7 Reweighting of gaussian distributions [statistical test of + gauss_dble()] + +time1 Timing of ranlxs and gauss + +time2 Timing of ranlxd and gauss_dble + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..ad85d1a9a64cd6eb701ab9a0df1162a0bca6b766 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/Makefile @@ -0,0 +1,122 @@ +################################################################################ +# +# Makefile to compile and link C programs +# +# Version valid for Linux machines +# +# "make" compiles and links the specified main programs and modules +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files created by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and required modules + +MAIN = check1 check2 check3 check4 check5 check6 check7\ + time1 time2 + +RANDOM = ranlxs ranlxd gauss random_su3 + +UTILS = utils + +SU3FCTS = su3prod su3ren + +EXTRAS = chebyshev ks_test pchi_square stat + +MODULES = $(RANDOM) $(UTILS) $(SU3FCTS) $(EXTRAS) + + +# search path for modules + +MDIR = ../../../modules + +VPATH = $(MDIR)/nompi/extras:$(MDIR)/nompi/utils:\ + $(MDIR)/random:$(MDIR)/su3fcts + + +# additional include directories + +INCPATH = ../../../include/nompi ../../../include + + +# additional libraries to be included + +LIBS = m + +LIBPATH = + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing -fno-inline \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash + +CC=$(GCC) + +PGMS= $(MAIN) $(MODULES) + +INCDIRS = $(addprefix -I,$(INCPATH)) + +OBJECTS = $(addsuffix .o,$(MODULES)) + +LDFLAGS = $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(CC) -MM -ansi $(INCDIRS) $< -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(INCDIRS) -o $@ + + +# rule to link object files + +$(MAIN): %: %.o $(OBJECTS) Makefile + $(CC) $< $(OBJECTS) $(CFLAGS) $(LDFLAGS) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables and old error log file + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o .tmp $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..26a42c22b0c0ea2e5081071396263f8a1eb9746c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/check1.c @@ -0,0 +1,315 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2005 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* This program checks that ranlxs and ranlxd implement the basic algorithm +* correctly +* +*******************************************************************************/ + +#include +#include +#include +#include "random.h" + +#define NXS 204 +#define NXD 99 + + +int main(void) +{ + int k,test1,test2; + int *state1,*state2; + float sbase; + float xs[NXS],ys[NXS],xsn[96]; + double base; + double xd[NXD],yd[NXD],xdn[48]; + + sbase=(float)(ldexp(1.0,24)); + base=ldexp(1.0,48); + state1=malloc(rlxs_size()*sizeof(int)); + state2=malloc(rlxd_size()*sizeof(int)); + + rlxs_init(0,32767); + rlxd_init(1,32767); + + +/******************************************************************************* +* +* Check that the correct sequences of random numbers are obtained +* +*******************************************************************************/ + + for (k=0;k<20;k++) + { + ranlxs(xs,NXS); + ranlxd(xd,NXD); + } + + xsn[0]=13257445.0f; + xsn[1]=15738482.0f; + xsn[2]=5448599.0f; + xsn[3]=9610459.0f; + xsn[4]=1046025.0f; + xsn[5]=2811360.0f; + xsn[6]=14923726.0f; + xsn[7]=2287739.0f; + xsn[8]=16133204.0f; + xsn[9]=16328320.0f; + xsn[10]=12980218.0f; + xsn[11]=9256959.0f; + xsn[12]=5633754.0f; + xsn[13]=7422961.0f; + xsn[14]=6032411.0f; + xsn[15]=14970828.0f; + xsn[16]=10717272.0f; + xsn[17]=2520878.0f; + xsn[18]=8906135.0f; + xsn[19]=8507426.0f; + xsn[20]=11925022.0f; + xsn[21]=12042827.0f; + xsn[22]=12263021.0f; + xsn[23]=4828801.0f; + xsn[24]=5300508.0f; + xsn[25]=13346776.0f; + xsn[26]=10869790.0f; + xsn[27]=8520207.0f; + xsn[28]=11213953.0f; + xsn[29]=14439320.0f; + xsn[30]=5716476.0f; + xsn[31]=13600448.0f; + xsn[32]=12545579.0f; + xsn[33]=3466523.0f; + xsn[34]=113906.0f; + xsn[35]=10407879.0f; + xsn[36]=12058596.0f; + xsn[37]=4390921.0f; + xsn[38]=1634350.0f; + xsn[39]=9823280.0f; + xsn[40]=12569690.0f; + xsn[41]=8267856.0f; + xsn[42]=5869501.0f; + xsn[43]=7210219.0f; + xsn[44]=1362361.0f; + xsn[45]=2956909.0f; + xsn[46]=504465.0f; + xsn[47]=6664636.0f; + xsn[48]=6048963.0f; + xsn[49]=1098525.0f; + xsn[50]=1261330.0f; + xsn[51]=2401071.0f; + xsn[52]=8087317.0f; + xsn[53]=1293933.0f; + xsn[54]=555494.0f; + xsn[55]=14872475.0f; + xsn[56]=11261534.0f; + xsn[57]=166813.0f; + xsn[58]=13424516.0f; + xsn[59]=15280818.0f; + xsn[60]=4644497.0f; + xsn[61]=6333595.0f; + xsn[62]=10012569.0f; + xsn[63]=6878028.0f; + xsn[64]=9176136.0f; + xsn[65]=8379433.0f; + xsn[66]=11073957.0f; + xsn[67]=2465529.0f; + xsn[68]=13633550.0f; + xsn[69]=12721649.0f; + xsn[70]=569725.0f; + xsn[71]=6375015.0f; + xsn[72]=2164250.0f; + xsn[73]=6725885.0f; + xsn[74]=7223108.0f; + xsn[75]=4890858.0f; + xsn[76]=11298261.0f; + xsn[77]=12086020.0f; + xsn[78]=4447706.0f; + xsn[79]=1164782.0f; + xsn[80]=1904399.0f; + xsn[81]=16669839.0f; + xsn[82]=2586766.0f; + xsn[83]=3605708.0f; + xsn[84]=15761082.0f; + xsn[85]=14937769.0f; + xsn[86]=13965017.0f; + xsn[87]=2175021.0f; + xsn[88]=16668997.0f; + xsn[89]=13996602.0f; + xsn[90]=6313099.0f; + xsn[91]=15646036.0f; + xsn[92]=9746447.0f; + xsn[93]=9596781.0f; + xsn[94]=9244169.0f; + xsn[95]=4731726.0f; + + xdn[0]=135665102723086.0; + xdn[1]=259840970195871.0; + xdn[2]=110726726657103.0; + xdn[3]=53972500363809.0; + xdn[4]=199301297412157.0; + xdn[5]=63744794353870.0; + xdn[6]=178745978725904.0; + xdn[7]=243549380863176.0; + xdn[8]=244796821836177.0; + xdn[9]=223788809121855.0; + xdn[10]=113720856430443.0; + xdn[11]=124607822268499.0; + xdn[12]=25705458431399.0; + xdn[13]=155476863764950.0; + xdn[14]=195602097736933.0; + xdn[15]=183038707238950.0; + xdn[16]=62268883953527.0; + xdn[17]=157047615112119.0; + xdn[18]=58134973897037.0; + xdn[19]=26908869337679.0; + xdn[20]=259927185454290.0; + xdn[21]=130534606773507.0; + xdn[22]=205295065526788.0; + xdn[23]=40201323262686.0; + xdn[24]=193822255723177.0; + xdn[25]=239720285097881.0; + xdn[26]=54433631586673.0; + xdn[27]=31313178820772.0; + xdn[28]=152904879618865.0; + xdn[29]=256187025780734.0; + xdn[30]=110292144635528.0; + xdn[31]=26555117184469.0; + xdn[32]=228913371644996.0; + xdn[33]=126837665590799.0; + xdn[34]=141069100232139.0; + xdn[35]=96171028602910.0; + xdn[36]=259271018918511.0; + xdn[37]=65257892816619.0; + xdn[38]=14254344610711.0; + xdn[39]=137794868158301.0; + xdn[40]=269703238916504.0; + xdn[41]=35782602710520.0; + xdn[42]=51447305327263.0; + xdn[43]=247852246697199.0; + xdn[44]=65072958134912.0; + xdn[45]=273325640150591.0; + xdn[46]=2768714666444.0; + xdn[47]=173907458721736.0; + + test1=0; + test2=0; + + for (k=0;k<96;k++) + { + if (xsn[k]!=(xs[k+60]*sbase)) + test1=1; + } + + for (k=0;k<48;k++) + { + if (xdn[k]!=(xd[k+39]*base)) + test2=1; + } + + if (test1==1) + { + printf("\n"); + printf("Test failed: ranlxs gives incorrect results\n"); + printf("=> do not use ranlxs on this machine\n"); + printf("\n"); + } + + if (test2==1) + { + printf("\n"); + printf("Test failed: ranlxd gives incorrect results\n"); + printf("=> do not use ranlxd on this machine\n"); + printf("\n"); + } + + +/******************************************************************************* +* +* Check of the I/O routines +* +*******************************************************************************/ + + rlxs_get(state1); + rlxd_get(state2); + + for (k=0;k<10;k++) + { + ranlxs(xs,NXS); + ranlxd(xd,NXD); + } + + rlxs_reset(state1); + rlxd_reset(state2); + + for (k=0;k<10;k++) + { + ranlxs(ys,NXS); + ranlxd(yd,NXD); + } + + for (k=0;k do not use ranlxs on this machine\n"); + printf("\n"); + } + + if (test2==2) + { + printf("\n"); + printf("Test failed: I/O routines for ranlxd do not work properly\n"); + printf("=> do not use ranlxd on this machine\n"); + printf("\n"); + } + + +/******************************************************************************* +* +* Success messages +* +*******************************************************************************/ + + if ((test1==0)&&(test2==0)) + { + printf("\n"); + printf("All tests passed\n"); + printf("=> ranlxs and ranlxd work correctly on this machine\n"); + printf("\n"); + } + else if (test1==0) + { + printf("\n"); + printf("All tests on ranlxs passed\n"); + printf("=> ranlxs works correctly on this machine\n"); + printf("\n"); + } + else if (test2==0) + { + printf("\n"); + printf("All tests on ranlxd passed\n"); + printf("=> ranlxd works correctly on this machine\n"); + printf("\n"); + } + exit(0); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..8a7045601bd145863522db5fa25b13c1297a6e18 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/check2.c @@ -0,0 +1,104 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2005, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Writes the state of ranlxs to a file together with the next 147 random +* numbers. Then reads the data back in and checks the correct reinitialization +* of the generator +* +*******************************************************************************/ + +#include +#include +#include +#include +#include "random.h" +#include "utils.h" + +#define N 147 + + +static void write_state(void) +{ + FILE *fp; + int k,ns,*state; + float base,r[N]; + + ns=rlxs_size(); + state=malloc(ns*sizeof(int)); + base=(float)(ldexp(1.0,24)); + rlxs_init(1,1234567); + + for (k=0;k<10;k++) + ranlxs(r,N); + + rlxs_get(state); + ranlxs(r,N); + + fp=fopen(".tmp","w"); + + for (k=0;k +#include +#include +#include +#include "random.h" +#include "utils.h" + +#define N 147 + + +static void write_state(void) +{ + FILE *fp; + int k,ns,*state; + double base,r[N]; + + ns=rlxd_size(); + state=malloc(ns*sizeof(int)); + base=(double)(ldexp(1.0,48)); + rlxd_init(1,1234567); + + for (k=0;k<10;k++) + ranlxd(r,N); + + rlxd_get(state); + ranlxd(r,N); + + fp=fopen(".tmp","w"); + + for (k=0;k +#include +#include +#include "random.h" +#include "utils.h" +#include "su3fcts.h" +#include "extras.h" + + +int main(void) +{ + int i,n; + float *r; + double *rd,*f,x; + double kp,km,pp,pm; + + printf("\n"); + printf("Check of the distribution produced by gauss and gauss_dble\n"); + printf("----------------------------------------------------------\n"); + + for (;;) + { + printf("\n"); + printf("Specify number of trials (0 exits): "); + + if (scanf("%d",&n)==1) + { + printf("\n"); + + if (n<=0) + exit(0); + + f=amalloc(n*sizeof(double),3); + r=amalloc(n*sizeof(float),3); + + gauss(r,n); + + for (i=0;i=0) + f[i]=0.5+0.5*pchi_square(2.0*x*x,1); + else + f[i]=0.5-0.5*pchi_square(2.0*x*x,1); + } + + ks_test(n,f,&kp,&km); + ks_prob(n,kp,km,&pp,&pm); + + printf("Distribution produced by gauss\n"); + printf("Kolmogorov-Smirnov test: K+ = %4.2f, K- = %4.2f\n",kp,km); + printf("This corresponds to Prob(K+) = %4.2f, Prob(K-) = %4.2f\n", + pp,pm); + printf("\n"); + + afree(r); + rd=amalloc(n*sizeof(double),3); + gauss_dble(rd,n); + + for (i=0;i=0) + f[i]=0.5+0.5*pchi_square(2.0*x*x,1); + else + f[i]=0.5-0.5*pchi_square(2.0*x*x,1); + } + + ks_test(n,f,&kp,&km); + ks_prob(n,kp,km,&pp,&pm); + + printf("Distribution produced by gauss_dble\n"); + printf("Kolmogorov-Smirnov test: K+ = %4.2f, K- = %4.2f\n",kp,km); + printf("This corresponds to Prob(K+) = %4.2f, Prob(K-) = %4.2f\n", + pp,pm); + printf("\n"); + + afree(f); + afree(rd); + } + else + { + printf("Invalid input, program stopped\n\n"); + break; + } + } + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/check5.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/check5.c new file mode 100644 index 0000000000000000000000000000000000000000..f01ef1ac3686b93546e3828594bb84ff7d8eb10b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/check5.c @@ -0,0 +1,165 @@ + +/******************************************************************************* +* +* File check5.c +* +* Copyright (C) 2005 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Statistical test of random_su3 +* +*******************************************************************************/ + +#include +#include +#include +#include "random.h" +#include "utils.h" +#include "su3fcts.h" +#include "extras.h" + + +static void dev(su3 *u,double *d1,double *d2) +{ + int i; + float *r; + double d; + complex det1,det2,det3,det; + su3 v,w; + + _su3_dagger(v,(*u)); + _su3_times_su3(w,v,(*u)); + + w.c11.re-=1.0f; + w.c22.re-=1.0f; + w.c33.re-=1.0f; + + *d1=0.0; + r=(float*)(&w); + + for (i=0;i<18;i++) + { + d=fabs((double)(r[i])); + if (d>(*d1)) + *d1=d; + } + + det1.re= + ((*u).c22.re*(*u).c33.re-(*u).c22.im*(*u).c33.im)- + ((*u).c23.re*(*u).c32.re-(*u).c23.im*(*u).c32.im); + det1.im= + ((*u).c22.re*(*u).c33.im+(*u).c22.im*(*u).c33.re)- + ((*u).c23.re*(*u).c32.im+(*u).c23.im*(*u).c32.re); + det2.re= + ((*u).c21.re*(*u).c33.re-(*u).c21.im*(*u).c33.im)- + ((*u).c23.re*(*u).c31.re-(*u).c23.im*(*u).c31.im); + det2.im= + ((*u).c21.re*(*u).c33.im+(*u).c21.im*(*u).c33.re)- + ((*u).c23.re*(*u).c31.im+(*u).c23.im*(*u).c31.re); + det3.re= + ((*u).c21.re*(*u).c32.re-(*u).c21.im*(*u).c32.im)- + ((*u).c22.re*(*u).c31.re-(*u).c22.im*(*u).c31.im); + det3.im= + ((*u).c21.re*(*u).c32.im+(*u).c21.im*(*u).c32.re)- + ((*u).c22.re*(*u).c31.im+(*u).c22.im*(*u).c31.re); + + det.re= + ((*u).c11.re*det1.re-(*u).c11.im*det1.im)- + ((*u).c12.re*det2.re-(*u).c12.im*det2.im)+ + ((*u).c13.re*det3.re-(*u).c13.im*det3.im); + det.im= + ((*u).c11.re*det1.im+(*u).c11.im*det1.re)- + ((*u).c12.re*det2.im+(*u).c12.im*det2.re)+ + ((*u).c13.re*det3.im+(*u).c13.im*det3.re); + + *d2=0.0; + d=fabs((double)(det.re)-1.0); + if (d>(*d2)) + *d2=d; + d=fabs((double)(det.im)); + if (d>(*d2)) + *d2=d; +} + + +int main(void) +{ + int i,n; + float *rw,*rz,wsq,zsq; + double *a,abar,sig,d1,d2,dmax1,dmax2; + complex wuz; + su3_vector w,z,uz; + su3 u; + + printf("\n"); + printf("Statistical test of random_su3\n"); + printf("------------------------------\n\n"); + + dmax1=0.0; + dmax2=0.0; + + for (i=0;i<10000;i++) + { + random_su3(&u); + dev(&u,&d1,&d2); + + if (d1>dmax1) + dmax1=d1; + if (d2>dmax2) + dmax2=d2; + } + + printf("In 10000 trials:\n"); + printf("max |1-U^dag*U| = %.1e\n",dmax1); + printf("max |1-det U| = %.1e\n",dmax2); + + for (;;) + { + rw=(float*)(&w); + rz=(float*)(&z); + gauss(rw,6); + gauss(rz,6); + wsq=_vector_prod_re(w,w); + zsq=_vector_prod_re(z,z); + + printf("\n"); + printf("Specify number of trials (0 exits): "); + + if (scanf("%d",&n)==1) + { + printf("\n"); + + if (n<=0) + exit(0); + + a=amalloc(n*sizeof(double),3); + + for (i=0;i = %1.4f [error %.1e]\n",abar,sig); + printf("Exact = %1.4f\n",wsq*zsq/3.0f); + + afree(a); + } + else + { + printf("Invalid input, program stopped\n\n"); + break; + } + } + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/check6.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/check6.c new file mode 100644 index 0000000000000000000000000000000000000000..1da9276c58764dd27f61d3a33000b53e2815f5a7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/check6.c @@ -0,0 +1,164 @@ + +/******************************************************************************* +* +* File check6.c +* +* Copyright (C) 2005, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Statistical test of random_su3_dble +* +*******************************************************************************/ + +#include +#include +#include +#include "random.h" +#include "utils.h" +#include "su3fcts.h" +#include "extras.h" + + +static void dev(su3_dble *u,double *d1,double *d2) +{ + int i; + double d,*r; + complex_dble det1,det2,det3,det; + su3_dble v,w; + + _su3_dagger(v,(*u)); + _su3_times_su3(w,v,(*u)); + + w.c11.re-=1.0; + w.c22.re-=1.0; + w.c33.re-=1.0; + + *d1=0.0; + r=(double*)(&w); + + for (i=0;i<18;i++) + { + d=fabs(r[i]); + if (d>(*d1)) + *d1=d; + } + + det1.re= + ((*u).c22.re*(*u).c33.re-(*u).c22.im*(*u).c33.im)- + ((*u).c23.re*(*u).c32.re-(*u).c23.im*(*u).c32.im); + det1.im= + ((*u).c22.re*(*u).c33.im+(*u).c22.im*(*u).c33.re)- + ((*u).c23.re*(*u).c32.im+(*u).c23.im*(*u).c32.re); + det2.re= + ((*u).c21.re*(*u).c33.re-(*u).c21.im*(*u).c33.im)- + ((*u).c23.re*(*u).c31.re-(*u).c23.im*(*u).c31.im); + det2.im= + ((*u).c21.re*(*u).c33.im+(*u).c21.im*(*u).c33.re)- + ((*u).c23.re*(*u).c31.im+(*u).c23.im*(*u).c31.re); + det3.re= + ((*u).c21.re*(*u).c32.re-(*u).c21.im*(*u).c32.im)- + ((*u).c22.re*(*u).c31.re-(*u).c22.im*(*u).c31.im); + det3.im= + ((*u).c21.re*(*u).c32.im+(*u).c21.im*(*u).c32.re)- + ((*u).c22.re*(*u).c31.im+(*u).c22.im*(*u).c31.re); + + det.re= + ((*u).c11.re*det1.re-(*u).c11.im*det1.im)- + ((*u).c12.re*det2.re-(*u).c12.im*det2.im)+ + ((*u).c13.re*det3.re-(*u).c13.im*det3.im); + det.im= + ((*u).c11.re*det1.im+(*u).c11.im*det1.re)- + ((*u).c12.re*det2.im+(*u).c12.im*det2.re)+ + ((*u).c13.re*det3.im+(*u).c13.im*det3.re); + + *d2=0.0; + d=fabs(det.re-1.0); + if (d>(*d2)) + *d2=d; + d=fabs(det.im); + if (d>(*d2)) + *d2=d; +} + + +int main(void) +{ + int i,n; + double *rw,*rz,wsq,zsq; + double *a,abar,sig,d1,d2,dmax1,dmax2; + complex_dble wuz; + su3_vector_dble w,z,uz; + su3_dble u; + + printf("\n"); + printf("Statistical test of random_su3_dble\n"); + printf("-----------------------------------\n\n"); + + dmax1=0.0; + dmax2=0.0; + + for (i=0;i<10000;i++) + { + random_su3_dble(&u); + dev(&u,&d1,&d2); + + if (d1>dmax1) + dmax1=d1; + if (d2>dmax2) + dmax2=d2; + } + + printf("In 10000 trials:\n"); + printf("max |1-U^dag*U| = %.1e\n",dmax1); + printf("max |1-det U| = %.1e\n",dmax2); + + for (;;) + { + rw=(double*)(&w); + rz=(double*)(&z); + gauss_dble(rw,6); + gauss_dble(rz,6); + wsq=_vector_prod_re(w,w); + zsq=_vector_prod_re(z,z); + + printf("\n"); + printf("Specify number of trials (0 exits): "); + + if (scanf("%d",&n)==1) + { + printf("\n"); + + if (n<=0) + exit(0); + + a=amalloc(n*sizeof(double),3); + + for (i=0;i = %1.4f [error %.1e]\n",abar,sig); + printf("Exact = %1.4f\n",wsq*zsq/3.0); + + afree(a); + } + else + { + printf("Invalid input, program stopped\n\n"); + break; + } + } + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/check7.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/check7.c new file mode 100644 index 0000000000000000000000000000000000000000..ac9753f95c6ab2f13aa62d2c81cd0a0e79277a7e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/check7.c @@ -0,0 +1,182 @@ + +/******************************************************************************* +* +* File check7.c +* +* Copyright (C) 2010 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Reweighting of gaussian distributions [statistical test of gauss_dble()] +* +*******************************************************************************/ + +#include +#include +#include +#include "random.h" +#include "utils.h" +#include "extras.h" + +#define NSMALL 100 +#define NLARGE 1000 +#define NTEST 10000 + +#if (NLARGEdev[i]) + dev[i]=d; + } + } + + for (i=0;i<4;i++) + { + sig[i]/=(double)(NTEST); + sig[i]=sqrt(sig[i]); + } +} + + +int main(void) +{ + int i; + + printf("\n"); + printf("Reweighting of gaussian distributions\n"); + printf("-------------------------------------\n\n"); + + printf("Width of the distribution = 1.0\n"); + printf("Width of the observable = 2.0,3.0,4.0\n"); + printf("%d test simulations of size %d and %d\n\n",NTEST,NSMALL,NLARGE); + + alloc_arrays(); + + printf("Sample size %d:\n\n",NSMALL); + + for (i=0;i +#include +#include +#include "random.h" + +#define NRLX 100 +#define NGSS 24 +#define NLOOPS 100000 + + +int main(void) +{ + int k,level; + float t1,t2,dt; + float r[NRLX]; + + printf("\n"); + printf("Timing of ranlxs (average time per random number in microsec)\n\n"); + + for (level=0;level<=2;level++) + { + rlxs_init(level,1); + + t1=(float)clock(); + for (k=1;k<=NLOOPS;k++) + ranlxs(r,NRLX); + t2=(float)clock(); + + dt=(t2-t1)/(float)(CLOCKS_PER_SEC); + dt*=1.0e6f/(float)(NRLX*NLOOPS); + + printf("%4.3f (level %1d) ",dt,level); + } + + printf("\n\n"); + printf("Timing of gauss (average time per random number in microsec)\n\n"); + + for (level=0;level<=2;level++) + { + rlxs_init(level,1); + + t1=(float)clock(); + for (k=1;k<=NLOOPS;k++) + gauss(r,NGSS); + t2=(float)clock(); + + dt=(t2-t1)/(float)(CLOCKS_PER_SEC); + dt*=1.0e6f/(float)(NGSS*NLOOPS); + + printf("%4.3f (level %1d) ",dt,level); + } + + printf("\n\n"); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/time2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/time2.c new file mode 100644 index 0000000000000000000000000000000000000000..ea04eb898a608d7ac216b89e605bf244e78c6878 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/time2.c @@ -0,0 +1,73 @@ + +/******************************************************************************* +* +* File time2.c +* +* Copyright (C) 2005 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Timing of ranlxd and gauss_dble +* +*******************************************************************************/ + +#include +#include +#include +#include "random.h" + +#define NRLX 100 +#define NGSS 24 +#define NLOOPS 100000 + + +int main(void) +{ + int k,level; + float t1,t2,dt; + double r[NRLX]; + + printf("\n"); + printf("Timing of ranlxd "); + printf("(average time per random number in microsec)\n\n"); + + for (level=1;level<=2;level++) + { + rlxd_init(level,1); + + t1=(float)clock(); + for (k=1;k<=NLOOPS;k++) + ranlxd(r,NRLX); + t2=(float)clock(); + + dt=(t2-t1)/(float)(CLOCKS_PER_SEC); + dt*=1.0e6f/(float)(NRLX*NLOOPS); + + printf("%4.3f (level %1d) ",dt,level); + } + + printf("\n\n"); + printf("Timing of gauss_dble "); + printf("(average time per random number in microsec)\n\n"); + + for (level=1;level<=2;level++) + { + rlxd_init(level,1); + + t1=(float)clock(); + for (k=1;k<=NLOOPS;k++) + gauss_dble(r,NGSS); + t2=(float)clock(); + + dt=(t2-t1)/(float)(CLOCKS_PER_SEC); + dt*=1.0e6f/(float)(NGSS*NLOOPS); + + printf("%4.3f (level %1d) ",dt,level); + } + + printf("\n\n"); + exit(0); +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..789dcbbed5cb2e0bf98b446db4fd39b0bb951c43 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/INDEX @@ -0,0 +1,12 @@ + +Rational approximations + +check1 Computation of the complete elliptic integral K(k) + +check2 Computation of the Jacobi elliptic functions sn,cn,dn + +check3 Zolotarev rational approximation to the sign function + +table1 Table of the relative error of the Zolotarev rational + approximation to the function f(x)=1/|x| (suitable for + plotting, for example) diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..bda117cabd83612d53b8b15fb39161f3c9003beb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/Makefile @@ -0,0 +1,119 @@ +################################################################################ +# +# Makefile to compile and link C programs +# +# Version valid for Linux machines +# +# "make" compiles and links the specified main programs and modules +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files created by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and required modules + +MAIN = check1 check2 check3 table1 + +RANDOM = ranlxd + +UTILS = utils + +RATFCTS = elliptic zolotarev + +MODULES = $(RANDOM) $(UTILS) $(RATFCTS) + + +# search path for modules + +MDIR = ../../../modules + +VPATH = $(MDIR)/nompi/utils:$(MDIR)/random:$(MDIR)/ratfcts:\ + $(MDIR)/utils + + +# additional include directories + +INCPATH = ../../../include/nompi ../../../include + + +# additional libraries to be included + +LIBS = m + +LIBPATH = + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 + + +############################## do not change ################################### + +SHELL=/bin/bash + +CC=$(GCC) + +PGMS= $(MAIN) $(MODULES) + +INCDIRS = $(addprefix -I,$(INCPATH)) + +OBJECTS = $(addsuffix .o,$(MODULES)) + +LDFLAGS = $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(CC) -MM -ansi $(INCDIRS) $< -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(INCDIRS) -o $@ + + +# rule to link object files + +$(MAIN): %: %.o $(OBJECTS) Makefile + $(CC) $< $(OBJECTS) $(CFLAGS) $(LDFLAGS) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables and old error log file + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..77e138958379e04e63c9a582e2a6dc847ce6c59c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/check1.c @@ -0,0 +1,109 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2008, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Computation of the complete elliptic integral K(k) +* +*******************************************************************************/ + +#include +#include +#include +#include +#include "random.h" +#include "utils.h" +#include "ratfcts.h" + + +static double Ksmall(double rk) +{ + double c0,c1,c2,c3; + double k,p; + + c0=1.0; + c1=1.0/4.0; + c2=9.0/64.0; + c3=25.0/256.0; + + k=(rk*rk)/(1.0+rk*rk); + + p=c2+k*c3; + p=c1+k*p; + p=c0+k*p; + + return 2.0*atan(1.0)*p; +} + + +int main(void) +{ + int n; + double rk,k,kp,km,dev,dmax; + + printf("\n"); + printf("Computation of the complete elliptic integral K(k)\n"); + printf("--------------------------------------------------\n\n"); + + rlxd_init(1,1234); + + km=pow(DBL_EPSILON,0.125); + dmax=fabs(1.0-Ksmall(0.0)/ellipticK(0.0)); + + for (n=0;n<1000;n++) + { + ranlxd(&rk,1); + rk*=km; + + dev=fabs(1.0-Ksmall(rk)/ellipticK(rk)); + + if (dev>dmax) + dmax=dev; + } + + printf("Small k region: maximal relative error = %.1e\n",dmax); + + dmax=0.0; + + for (n=0;n<1000;n++) + { + ranlxd(&rk,1); + rk=rk/(1.0-rk); + + k=rk/sqrt(1.0+rk*rk); + kp=1.0/sqrt(1.0+rk*rk); + + dev=fabs(1.0- + ellipticK(2.0*sqrt(k)*(1.0+k)/(kp*kp))/ + ((1.0+k)*ellipticK(rk))); + + if (dev>dmax) + dmax=dev; + } + + printf("Gauss transformation: maximal relative error = %.1e\n\n",dmax); + printf("Print values at specfied k/k'\n\n"); + + for (;;) + { + printf("k/k' = "); + + if (scanf("%lf",&rk)==1) + { + printf("k = %.8e, k' = %.8e, K(k) = %.16e\n\n", + rk/sqrt(1.0+rk*rk),1.0/sqrt(1.0+rk*rk),ellipticK(rk)); + } + else + { + printf("Invalid input value, program stopped\n\n"); + break; + } + } + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..00f4401bd7fe795c611483d397afcfe1e2f9e62e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/check2.c @@ -0,0 +1,266 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2008, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Computation of the Jacobi elliptic functions sn,cn,dn +* +*******************************************************************************/ + +#include +#include +#include +#include +#include "random.h" +#include "utils.h" +#include "ratfcts.h" + + +static void sncndn_smallk(double u,double rk,double *sn,double *cn,double *dn) +{ + int n,nmax; + double K,Kp,pi; + double t,v,tn,vn,k,r; + + K=ellipticK(rk); + Kp=ellipticK(1.0/rk); + pi=4.0*atan(1.0); + + t=pi*Kp/K; + v=(pi*u)/(2.0*K); + + nmax=(int)(-1.5*log(DBL_EPSILON)/t); + if (nmax<1) + nmax=1; + + (*sn)=0.0; + (*cn)=0.0; + (*dn)=0.0; + + for (n=nmax;n>=0;n--) + { + tn=(double)(n)*t; + vn=(double)(2*n+1)*v; + + (*sn)+=(exp(-tn)*sin(vn)/(1.0-exp(-2.0*tn-t))); + (*cn)+=(exp(-tn)*cos(vn)/(1.0+exp(-2.0*tn-t))); + + if (n>0) + { + vn=(double)(2*n)*v; + (*dn)+=(exp(-tn)*cos(vn)/(1.0+exp(-2.0*tn))); + } + } + + k=rk/sqrt(1.0+rk*rk); + r=(2.0*pi*exp(-0.5*t))/(k*K); + + (*sn)*=r; + (*cn)*=r; + (*dn)=(pi/(2.0*K))*(1.0+4.0*(*dn)); +} + + +static void sncndn_landen(double u,double rk,double *sn,double *cn,double *dn) +{ + double k,kp,kt,ktp,r; + + kp=1.0/sqrt(1.0+rk*rk); + k=rk*kp; + kt=k/(1.0+kp); + kt=(kt*kt); + ktp=2.0*sqrt(kp)/(1.0+kp); + + sncndn(u/(1.0+kt),kt/ktp,sn,cn,dn); + + r=1.0/(1.0+kt*(*sn)*(*sn)); + + (*sn)=(1.0+kt)*(*sn)*r; + (*cn)=(*cn)*(*dn)*r; + (*dn)=sqrt(kp*kp+k*k*(*cn)*(*cn)); +} + + +int main(void) +{ + int n; + double u,rk,K; + double sn,cn,dn,snsk,cnsk,dnsk; + double dmax_sn,dmax_cn,dmax_dn,dev; + + printf("\n"); + printf("Computation of the Jacobi elliptic functions sn,cn,dn\n"); + printf("-----------------------------------------------------\n\n"); + + rlxd_init(1,1234); + + dmax_sn=0.0; + dmax_cn=0.0; + dmax_dn=0.0; + + for (n=0;n<10000;n++) + { + ranlxd(&rk,1); + rk*=0.1; + K=ellipticK(rk); + ranlxd(&u,1); + u=K*(0.5-u); + + sncndn(u,rk,&sn,&cn,&dn); + sncndn_smallk(u,rk,&snsk,&cnsk,&dnsk); + + if (sn!=0.0) + { + dev=fabs(1.0-snsk/sn); + + if (dev>dmax_sn) + dmax_sn=dev; + } + + dev=fabs(1.0-cnsk/cn); + + if (dev>dmax_cn) + dmax_cn=dev; + + dev=fabs(1.0-dnsk/dn); + + if (dev>dmax_dn) + dmax_dn=dev; + } + + printf("-K/2<=u<=K/2, rk<=0.1:\n"); + printf("maximal relative error (sn,cn,dn) = (%.1e,%.1e,%.1e)\n\n", + dmax_sn,dmax_cn,dmax_dn); + + dmax_sn=0.0; + dmax_cn=0.0; + dmax_dn=0.0; + + for (n=0;n<10000;n++) + { + ranlxd(&rk,1); + rk*=0.1; + K=ellipticK(rk); + ranlxd(&u,1); + u=16.0*K*(0.5-u); + + sncndn(u,rk,&sn,&cn,&dn); + sncndn_smallk(u,rk,&snsk,&cnsk,&dnsk); + + dev=fabs(snsk-sn); + + if (dev>dmax_sn) + dmax_sn=dev; + + dev=fabs(cnsk-cn); + + if (dev>dmax_cn) + dmax_cn=dev; + + dev=fabs(dnsk-dn); + + if (dev>dmax_dn) + dmax_dn=dev; + } + + printf("-8*K<=u<=8K, rk<=0.1:\n"); + printf("maximal absolute error (sn,cn,dn) = (%.1e,%.1e,%.1e)\n\n", + dmax_sn,dmax_cn,dmax_dn); + + dmax_sn=0.0; + dmax_cn=0.0; + dmax_dn=0.0; + + for (n=0;n<10000;n++) + { + ranlxd(&rk,1); + rk=rk/(1.0-rk); + K=ellipticK(rk); + ranlxd(&u,1); + u=K*(0.5-u); + + sncndn(u,rk,&sn,&cn,&dn); + sncndn_landen(u,rk,&snsk,&cnsk,&dnsk); + + if (sn!=0.0) + { + dev=fabs(1.0-snsk/sn); + + if (dev>dmax_sn) + dmax_sn=dev; + } + + dev=fabs(1.0-cnsk/cn); + + if (dev>dmax_cn) + dmax_cn=dev; + + dev=fabs(1.0-dnsk/dn); + + if (dev>dmax_dn) + dmax_dn=dev; + } + + printf("-K/2<=u<=K/2, Landen recursion:\n"); + printf("maximal relative error (sn,cn,dn) = (%.1e,%.1e,%.1e)\n\n", + dmax_sn,dmax_cn,dmax_dn); + + dmax_sn=0.0; + dmax_cn=0.0; + dmax_dn=0.0; + + for (n=0;n<10000;n++) + { + ranlxd(&rk,1); + rk=rk/(1.0-rk); + K=ellipticK(rk); + ranlxd(&u,1); + u=16.0*K*(0.5-u); + + sncndn(u,rk,&sn,&cn,&dn); + sncndn_landen(u,rk,&snsk,&cnsk,&dnsk); + + dev=fabs(snsk-sn); + + if (dev>dmax_sn) + dmax_sn=dev; + + dev=fabs(cnsk-cn); + + if (dev>dmax_cn) + dmax_cn=dev; + + dev=fabs(dnsk-dn); + + if (dev>dmax_dn) + dmax_dn=dev; + } + + printf("-8*K<=u<=8K, Landen recursion:\n"); + printf("maximal absolute error (sn,cn,dn) = (%.1e,%.1e,%.1e)\n\n", + dmax_sn,dmax_cn,dmax_dn); + printf("Print values at specfied u and k/k'\n\n"); + + for (;;) + { + printf("u, k/k' = "); + + if (scanf("%lf %lf",&u,&rk)==2) + { + sncndn(u,rk,&sn,&cn,&dn); + printf("sn = %.16e, cn = %.16e, dn = %.16e\n",sn,cn,dn); + } + else + { + printf("Invalid input values, program stopped\n\n"); + break; + } + } + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..91b8ce42138941c907fd0126aeeff47682e11f35 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/check3.c @@ -0,0 +1,122 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2008 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Zolotarev rational approximation to the function f(x)=1/|x| +* +*******************************************************************************/ + +#include +#include +#include +#include +#include "random.h" +#include "utils.h" +#include "ratfcts.h" + +static int ns=0; +static double As,*ars; + + +static void alloc_ars(int n) +{ + if (n<=ns) + return; + + if (ns!=0) + afree(ars); + + ars=amalloc(2*n*sizeof(double),3); + + error(ars==NULL,1,"alloc_ars [check3.c]", + "Unable to allocate coefficient array"); + + ns=n; +} + + +static double Zolo(int n,double y) +{ + int r; + double p; + + p=1.0; + + for (r=0;rdmax) + dmax=dev; + } + + printf("Relative error delta = %.1e (measured: %.1e)\n",delta,dmax); + printf("Amplitude A: %.1e\n",As); + printf("Coefficients a_r: Numerator Denominator\n"); + b=b*b; + + for (r=0;r<(2*n);r+=2) + { + printf(" %.3e %.3e\n", + ars[r]*b,ars[r+1]*b); + } + + printf("\n\n"); + } + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/table1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/table1.c new file mode 100644 index 0000000000000000000000000000000000000000..fffedaa19d93936472fab9909a7e843363ffcd57 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/table1.c @@ -0,0 +1,125 @@ + +/******************************************************************************* +* +* File table1.c +* +* Copyright (C) 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Table of the relative error of the Zolotarev rational approximation to the +* function f(x)=1/|x| (suitable for plotting, for example) +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "utils.h" +#include "ratfcts.h" + +static int n; +static double eps,delta,As,*ars; + + +static void alloc_ars(void) +{ + ars=amalloc(2*n*sizeof(double),3); + + error(ars==NULL,1,"alloc_ars [table1.c]", + "Unable to allocate coefficient array"); +} + + +static double zolotarev_sign(double x) +{ + int r; + double y,p; + + y=x*x; + p=1.0; + + for (r=0;r +#include +#include +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "su3fcts.h" + + +static double max_dev(su3_dble *u,su3_dble *v) +{ + int i; + double r[18],s[18]; + double nrm,d,dmax; + + r[ 0]=(*u).c11.re; + r[ 1]=(*u).c11.im; + r[ 2]=(*u).c12.re; + r[ 3]=(*u).c12.im; + r[ 4]=(*u).c13.re; + r[ 5]=(*u).c13.im; + + r[ 6]=(*u).c21.re; + r[ 7]=(*u).c21.im; + r[ 8]=(*u).c22.re; + r[ 9]=(*u).c22.im; + r[10]=(*u).c23.re; + r[11]=(*u).c23.im; + + r[12]=(*u).c31.re; + r[13]=(*u).c31.im; + r[14]=(*u).c32.re; + r[15]=(*u).c32.im; + r[16]=(*u).c33.re; + r[17]=(*u).c33.im; + + s[ 0]=(*v).c11.re; + s[ 1]=(*v).c11.im; + s[ 2]=(*v).c12.re; + s[ 3]=(*v).c12.im; + s[ 4]=(*v).c13.re; + s[ 5]=(*v).c13.im; + + s[ 6]=(*v).c21.re; + s[ 7]=(*v).c21.im; + s[ 8]=(*v).c22.re; + s[ 9]=(*v).c22.im; + s[10]=(*v).c23.re; + s[11]=(*v).c23.im; + + s[12]=(*v).c31.re; + s[13]=(*v).c31.im; + s[14]=(*v).c32.re; + s[15]=(*v).c32.im; + s[16]=(*v).c33.re; + s[17]=(*v).c33.im; + + nrm=0.0; + dmax=0.0; + + for (i=0;i<18;i++) + { + nrm+=r[i]*r[i]; + d=(r[i]-s[i])*(r[i]-s[i]); + i+=1; + nrm+=r[i]*r[i]; + d+=(r[i]-s[i])*(r[i]-s[i]); + + if (d>dmax) + dmax=d; + } + + return sqrt(dmax/nrm); +} + + +static void random_u3alg(u3_alg_dble *X) +{ + double r[9]; + + ranlxd(r,9); + + (*X).c1=r[0]-0.5; + (*X).c2=r[1]-0.5; + (*X).c3=r[2]-0.5; + (*X).c4=r[3]-0.5; + (*X).c5=r[4]-0.5; + (*X).c6=r[5]-0.5; + (*X).c7=r[6]-0.5; + (*X).c8=r[7]-0.5; + (*X).c9=r[8]-0.5; +} + + +static void X2u(u3_alg_dble *X,su3_dble *u) +{ + (*u).c11.re=0.0; + (*u).c11.im= (*X).c1; + (*u).c22.re=0.0; + (*u).c22.im= (*X).c2; + (*u).c33.re=0.0; + (*u).c33.im= (*X).c3; + + (*u).c12.re= (*X).c4; + (*u).c12.im= (*X).c5; + (*u).c21.re=-(*X).c4; + (*u).c21.im= (*X).c5; + + (*u).c13.re= (*X).c6; + (*u).c13.im= (*X).c7; + (*u).c31.re=-(*X).c6; + (*u).c31.im= (*X).c7; + + (*u).c23.re= (*X).c8; + (*u).c23.im= (*X).c9; + (*u).c32.re=-(*X).c8; + (*u).c32.im= (*X).c9; +} + + +int main(void) +{ + double d1,d2,d3,d4; + su3_dble *u,*v,*w1,*w2; + u3_alg_dble *X; + + printf("\n"); + printf("Check of su3xsu3, su3dagxsu3, ...\n"); + printf("---------------------------------\n\n"); + +#if (defined AVX) + printf("Using AVX instructions\n\n"); +#elif (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n\n"); +#endif + + u=amalloc(4*sizeof(su3_dble),4); + X=amalloc(sizeof(u3_alg_dble),3); + error((u==NULL)||(X==NULL),1,"main [check1.c]", + "Unable to allocate auxiliary arrays"); + + v=u+1; + w1=u+2; + w2=u+3; + + rlxd_init(1,23456); + + random_su3_dble(u); + random_su3_dble(v); + su3xsu3(u,v,w1); + _su3_times_su3(*w2,*u,*v); + d1=max_dev(w1,w2); + + random_su3_dble(u); + random_su3_dble(v); + su3dagxsu3(u,v,w1); + _su3_dagger(*w2,*u); + *u=*w2; + _su3_times_su3(*w2,*u,*v); + d2=max_dev(w1,w2); + + random_su3_dble(u); + random_su3_dble(v); + su3xsu3dag(u,v,w1); + _su3_dagger(*w2,*v); + *v=*w2; + _su3_times_su3(*w2,*u,*v); + d3=max_dev(w1,w2); + + random_su3_dble(u); + random_su3_dble(v); + su3dagxsu3dag(u,v,w1); + _su3_dagger(*w2,*u); + *u=*w2; + _su3_dagger(*w2,*v); + *v=*w2; + _su3_times_su3(*w2,*u,*v); + d4=max_dev(w1,w2); + + printf("su3xsu3: %.2e\n",d1); + printf("su3dagxsu3: %.2e\n",d2); + printf("su3xsu3dag: %.2e\n",d3); + printf("su3dagxsu3dag: %.2e\n",d4); + + random_su3_dble(u); + random_u3alg(X); + su3xu3alg(u,X,w1); + X2u(X,v); + _su3_times_su3(*w2,*u,*v); + d1=max_dev(w1,w2); + + random_su3_dble(u); + random_u3alg(X); + su3dagxu3alg(u,X,w1); + _su3_dagger(*w2,*u); + *u=*w2; + X2u(X,v); + _su3_times_su3(*w2,*u,*v); + d2=max_dev(w1,w2); + + random_su3_dble(v); + random_u3alg(X); + u3algxsu3(X,v,w1); + X2u(X,u); + _su3_times_su3(*w2,*u,*v); + d3=max_dev(w1,w2); + + random_su3_dble(v); + random_u3alg(X); + u3algxsu3dag(X,v,w1); + X2u(X,u); + _su3_dagger(*w2,*v); + *v=*w2; + _su3_times_su3(*w2,*u,*v); + d4=max_dev(w1,w2); + + printf("su3xu3alg: %.2e\n",d1); + printf("su3dagxu3alg: %.2e\n",d2); + printf("u3algxsu3: %.2e\n",d3); + printf("u3algxsu3dag: %.2e\n\n",d4); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..3da3c2271d5bb36d7f0247f10e738288fa263988 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check2.c @@ -0,0 +1,239 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2005, 2009, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of prod2su3alg, prod2u3alg and rotate_su3alg +* +*******************************************************************************/ + +#include +#include +#include +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "su3fcts.h" + +static const su3_vector_dble vd0={{0.0}}; +static const spinor_dble sd0={{{0.0}}}; +static su3_dble Q,u,v ALIGNED16; +static su3_alg_dble X ALIGNED16; +static u3_alg_dble Y; + + +static void random_su3alg(su3_alg_dble *X) +{ + double r[8]; + + ranlxd(r,8); + + (*X).c1=r[0]-0.5; + (*X).c2=r[1]-0.5; + (*X).c3=r[2]-0.5; + (*X).c4=r[3]-0.5; + (*X).c5=r[4]-0.5; + (*X).c6=r[5]-0.5; + (*X).c7=r[6]-0.5; + (*X).c8=r[7]-0.5; +} + + +static void random_u3alg(u3_alg_dble *X) +{ + double r[9]; + + ranlxd(r,9); + + (*X).c1=r[0]-0.5; + (*X).c2=r[1]-0.5; + (*X).c3=r[2]-0.5; + (*X).c4=r[3]-0.5; + (*X).c5=r[4]-0.5; + (*X).c6=r[5]-0.5; + (*X).c7=r[6]-0.5; + (*X).c8=r[7]-0.5; + (*X).c9=r[8]-0.5; +} + + +static void X2u(su3_alg_dble *X,su3_dble *u) +{ + (*u).c11.re=0.0; + (*u).c11.im= (*X).c1+(*X).c2; + (*u).c22.re=0.0; + (*u).c22.im= (*X).c2-2.0*(*X).c1; + (*u).c33.re=0.0; + (*u).c33.im= (*X).c1-2.0*(*X).c2; + + (*u).c12.re= (*X).c3; + (*u).c12.im= (*X).c4; + (*u).c21.re=-(*X).c3; + (*u).c21.im= (*X).c4; + + (*u).c13.re= (*X).c5; + (*u).c13.im= (*X).c6; + (*u).c31.re=-(*X).c5; + (*u).c31.im= (*X).c6; + + (*u).c23.re= (*X).c7; + (*u).c23.im= (*X).c8; + (*u).c32.re=-(*X).c7; + (*u).c32.im= (*X).c8; +} + + +int main(void) +{ + double tr,d; + + printf("\n"); + printf("Check of prod2su3alg and rotate_su3alg\n"); + printf("--------------------------------------\n\n"); + +#if (defined AVX) + printf("Using AVX instructions\n\n"); +#elif (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n\n"); +#endif + + rlxd_init(1,23456); + + printf("prod2su3alg:\n"); + random_su3_dble(&u); + random_su3_dble(&v); + random_su3alg(&X); + + tr=prod2su3alg(&u,&v,&X); + _su3_times_su3(Q,u,v); + tr-=(Q.c11.re+Q.c22.re+Q.c33.re); + + Q.c11.re=0.5*(Q.c11.re-Q.c11.re); + Q.c11.im=0.5*(Q.c11.im+Q.c11.im); + Q.c12.re=0.5*(Q.c12.re-Q.c21.re); + Q.c12.im=0.5*(Q.c12.im+Q.c21.im); + Q.c13.re=0.5*(Q.c13.re-Q.c31.re); + Q.c13.im=0.5*(Q.c13.im+Q.c31.im); + + Q.c22.re=0.5*(Q.c22.re-Q.c22.re); + Q.c22.im=0.5*(Q.c22.im+Q.c22.im); + Q.c23.re=0.5*(Q.c23.re-Q.c32.re); + Q.c23.im=0.5*(Q.c23.im+Q.c32.im); + + Q.c33.re=0.5*(Q.c33.re-Q.c33.re); + Q.c33.im=0.5*(Q.c33.im+Q.c33.im); + + d=(Q.c11.im+Q.c22.im+Q.c33.im)/3.0; + Q.c11.im-=d; + Q.c22.im-=d; + Q.c33.im-=d; + + d=fabs(Q.c11.im-X.c1-X.c2); + printf("X.c11.im: %.2e\n",d); + d=fabs(Q.c22.im+2.0*X.c1-X.c2); + printf("X.c22.im: %.2e\n",d); + d=fabs(Q.c33.im-X.c1+2.0*X.c2); + printf("X.c33.im: %.2e\n",d); + + d=fabs(Q.c12.re-X.c3); + printf("X.c12.re: %.2e\n",d); + d=fabs(Q.c12.im-X.c4); + printf("X.c12.im: %.2e\n",d); + + d=fabs(Q.c13.re-X.c5); + printf("X.c13.re: %.2e\n",d); + d=fabs(Q.c13.im-X.c6); + printf("X.c13.im: %.2e\n",d); + + d=fabs(Q.c23.re-X.c7); + printf("X.c23.re: %.2e\n",d); + d=fabs(Q.c23.im-X.c8); + printf("X.c23.im: %.2e\n",d); + d=fabs(tr); + printf("Return value: %.2e\n\n",d); + + printf("prod2u3alg:\n"); + random_su3_dble(&u); + random_su3_dble(&v); + random_u3alg(&Y); + + prod2u3alg(&u,&v,&Y); + _su3_times_su3(Q,u,v); + + Q.c11.re=Q.c11.re-Q.c11.re; + Q.c11.im=Q.c11.im+Q.c11.im; + Q.c12.re=Q.c12.re-Q.c21.re; + Q.c12.im=Q.c12.im+Q.c21.im; + Q.c13.re=Q.c13.re-Q.c31.re; + Q.c13.im=Q.c13.im+Q.c31.im; + + Q.c22.re=Q.c22.re-Q.c22.re; + Q.c22.im=Q.c22.im+Q.c22.im; + Q.c23.re=Q.c23.re-Q.c32.re; + Q.c23.im=Q.c23.im+Q.c32.im; + + Q.c33.re=Q.c33.re-Q.c33.re; + Q.c33.im=Q.c33.im+Q.c33.im; + + d=fabs(Q.c11.im-Y.c1); + printf("X.c11.im: %.2e\n",d); + d=fabs(Q.c22.im-Y.c2); + printf("X.c22.im: %.2e\n",d); + d=fabs(Q.c33.im-Y.c3); + printf("X.c33.im: %.2e\n",d); + + d=fabs(Q.c12.re-Y.c4); + printf("X.c12.re: %.2e\n",d); + d=fabs(Q.c12.im-Y.c5); + printf("X.c12.im: %.2e\n",d); + + d=fabs(Q.c13.re-Y.c6); + printf("X.c13.re: %.2e\n",d); + d=fabs(Q.c13.im-Y.c7); + printf("X.c13.im: %.2e\n",d); + + d=fabs(Q.c23.re-Y.c8); + printf("X.c23.re: %.2e\n",d); + d=fabs(Q.c23.im-Y.c9); + printf("X.c23.im: %.2e\n\n",d); + + printf("rotate_su3alg:\n"); + random_su3_dble(&u); + random_su3alg(&X); + X2u(&X,&v); + + rotate_su3alg(&u,&X); + + _su3_times_su3(Q,u,v); + _su3_dagger(v,u); + _su3_times_su3(u,Q,v); + + d=fabs(u.c11.im-X.c1-X.c2); + printf("X.c11.im: %.2e\n",d); + d=fabs(u.c22.im+2.0*X.c1-X.c2); + printf("X.c22.im: %.2e\n",d); + d=fabs(u.c33.im-X.c1+2.0*X.c2); + printf("X.c33.im: %.2e\n",d); + + d=fabs(u.c12.re-X.c3); + printf("X.c12.re: %.2e\n",d); + d=fabs(u.c12.im-X.c4); + printf("X.c12.im: %.2e\n",d); + + d=fabs(u.c13.re-X.c5); + printf("X.c13.re: %.2e\n",d); + d=fabs(u.c13.im-X.c6); + printf("X.c13.im: %.2e\n",d); + + d=fabs(u.c23.re-X.c7); + printf("X.c23.re: %.2e\n",d); + d=fabs(u.c23.im-X.c8); + printf("X.c23.im: %.2e\n\n",d); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..2e13369897adf0e93b6e4af7918da43e7593191e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check3.c @@ -0,0 +1,177 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2009, 2011 Filippo Palombi, Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of chexp_drv0() and ch2mat() using the spectral representation of X +* +*******************************************************************************/ + +#include +#include +#include +#include "su3.h" +#include "utils.h" +#include "random.h" +#include "su3fcts.h" + +#define NTEST 100000 +#define SEED 58693 + +static double mu[3],t,d; +static su3_alg_dble *X; +static su3_dble *r,*u,*v,*w; +static const su3_dble u0={{0.0}}; +static ch_drv0_t *sp; + + +static void alloc_Xu(void) +{ + X=amalloc(1*sizeof(*X),4); + r=amalloc(4*sizeof(*r),4); + sp=amalloc(1*sizeof(*sp),4); + + error((X==NULL)||(r==NULL)||(sp==NULL),1, + "alloc_Xu [check3.c]","Unable to allocate matrices"); + + u=r+1; + v=r+2; + w=r+3; +} + + +static void random_Xu(void) +{ + for (;;) + { + ranlxd(mu,2); + mu[0]=2.0*mu[0]-1.0; + mu[1]=2.0*mu[1]-1.0; + mu[2]=-mu[0]-mu[1]; + + if (fabs(mu[2])<=1.0) + break; + } + + t=0.5*(mu[0]*mu[0]+mu[1]*mu[1]+mu[2]*mu[2]); + d=mu[0]*mu[1]*mu[2]; + + (*u)=u0; + (*u).c11.im=mu[0]; + (*u).c22.im=mu[1]; + (*u).c33.im=mu[2]; + + random_su3_dble(r); + su3xsu3(r,u,w); + su3xsu3dag(w,r,u); + + (*X).c1=((*u).c11.im-(*u).c22.im)/3.0; + (*X).c2=((*u).c11.im-(*u).c33.im)/3.0; + (*X).c3=(*u).c12.re; + (*X).c4=(*u).c12.im; + (*X).c5=(*u).c13.re; + (*X).c6=(*u).c13.im; + (*X).c7=(*u).c23.re; + (*X).c8=(*u).c23.im; + + (*u)=u0; + (*u).c11.re=cos(mu[0]); + (*u).c22.re=cos(mu[1]); + (*u).c33.re=cos(mu[2]); + (*u).c11.im=sin(mu[0]); + (*u).c22.im=sin(mu[1]); + (*u).c33.im=sin(mu[2]); + + su3xsu3(r,u,w); + su3xsu3dag(w,r,u); +} + + +static double dev_uv(void) +{ + int i; + double r[18],dev,dmax; + + r[ 0]=(*u).c11.re-(*v).c11.re; + r[ 1]=(*u).c11.im-(*v).c11.im; + r[ 2]=(*u).c12.re-(*v).c12.re; + r[ 3]=(*u).c12.im-(*v).c12.im; + r[ 4]=(*u).c13.re-(*v).c13.re; + r[ 5]=(*u).c13.im-(*v).c13.im; + + r[ 6]=(*u).c21.re-(*v).c21.re; + r[ 7]=(*u).c21.im-(*v).c21.im; + r[ 8]=(*u).c22.re-(*v).c22.re; + r[ 9]=(*u).c22.im-(*v).c22.im; + r[10]=(*u).c23.re-(*v).c23.re; + r[11]=(*u).c23.im-(*v).c23.im; + + r[12]=(*u).c31.re-(*v).c31.re; + r[13]=(*u).c31.im-(*v).c31.im; + r[14]=(*u).c32.re-(*v).c32.re; + r[15]=(*u).c32.im-(*v).c32.im; + r[16]=(*u).c33.re-(*v).c33.re; + r[17]=(*u).c33.im-(*v).c33.im; + + dmax=0.0; + + for (i=0;i<18;i++) + { + dev=fabs(r[i]); + if (dev>dmax) + dmax=dev; + } + + return dmax; +} + + +int main(void) +{ + int i; + double dev,dmax1,dmax2,dmax3; + + printf("\n"); + printf("Check of chexp_drv0() and ch2mat()\n"); + printf("----------------------------------\n\n"); + + printf("Test performed on %d random matrices X using the\n",NTEST); + printf("spectral representation of X\n\n"); + + rlxd_init(1,SEED); + alloc_Xu(); + + dmax1=0.0; + dmax2=0.0; + dmax3=0.0; + + for (i=0;idmax1) + dmax1=dev; + + dev=fabs(d-(*sp).d); + if (dev>dmax2) + dmax2=dev; + + ch2mat((*sp).p,X,v); + dev=dev_uv(); + if (dev>dmax3) + dmax3=dev; + } + + printf ("Maximal deviation of t = %.1e\n",dmax1); + printf ("Maximal deviation of d = %.1e\n",dmax2); + printf ("Maximal deviation of exp(X) = %.1e\n\n",dmax3); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check4.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check4.c new file mode 100644 index 0000000000000000000000000000000000000000..74be208f776f0bea8d27ae50e9999c64ec6b7f3a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check4.c @@ -0,0 +1,238 @@ + +/******************************************************************************* +* +* File check4.c +* +* Copyright (C) 2009, 2011 Filippo Palombi, Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Comparison of chexp_drv2() with chexp_drv0() and chexp_drv1() and +* invariance of the calculated coefficients under rotations of X +* +*******************************************************************************/ + +#include +#include +#include +#include "su3.h" +#include "utils.h" +#include "random.h" +#include "su3fcts.h" + +#define NTEST 100000 +#define SEED 773 + +static double mu[3]; +static su3_alg_dble *X; +static su3_dble *r,*u,*w; +static const su3_dble u0={{0.0}}; +static ch_drv0_t *sp; +static ch_drv1_t *sg; +static ch_drv2_t *sf; + + +static void alloc_Xu(void) +{ + X=amalloc(1*sizeof(*X),4); + r=amalloc(3*sizeof(*r),4); + sp=amalloc(1*sizeof(*sp),4); + sg=amalloc(1*sizeof(*sg),4); + sf=amalloc(2*sizeof(*sf),4); + + error((X==NULL)||(r==NULL)||(sp==NULL)||(sg==NULL)||(sf==NULL),1, + "alloc_Xu [check4.c]","Unable to allocate matrices"); + + u=r+1; + w=r+2; +} + + +static void random_Xu(void) +{ + for (;;) + { + ranlxd(mu,2); + mu[0]=2.0*mu[0]-1.0; + mu[1]=2.0*mu[1]-1.0; + mu[2]=-mu[0]-mu[1]; + + if (fabs(mu[2])<=1.0) + break; + } + + (*u)=u0; + (*u).c11.im=mu[0]; + (*u).c22.im=mu[1]; + (*u).c33.im=mu[2]; + + random_su3_dble(r); + su3xsu3(r,u,w); + su3xsu3dag(w,r,u); + + (*X).c1=((*u).c11.im-(*u).c22.im)/3.0; + (*X).c2=((*u).c11.im-(*u).c33.im)/3.0; + (*X).c3=(*u).c12.re; + (*X).c4=(*u).c12.im; + (*X).c5=(*u).c13.re; + (*X).c6=(*u).c13.im; + (*X).c7=(*u).c23.re; + (*X).c8=(*u).c23.im; +} + + +static double dev_sp(void) +{ + int i; + double r[8],dev,dmax; + + r[0]=(*sp).t-(*sf).t; + r[1]=(*sp).d-(*sf).d; + + for (i=0;i<3;i++) + { + r[2*i+2]=(*sp).p[i].re-(*sf).p[i].re; + r[2*i+3]=(*sp).p[i].im-(*sf).p[i].im; + } + + dmax=0.0; + + for (i=0;i<8;i++) + { + dev=fabs(r[i]); + if (dev>dmax) + dmax=dev; + } + + return dmax; +} + + +static double dev_sg(void) +{ + int i; + double r[20],dev,dmax; + + r[0]=(*sp).t-(*sf).t; + r[1]=(*sp).d-(*sf).d; + + for (i=0;i<3;i++) + { + r[6*i+2]=(*sg).p[i].re-(*sf).p[i].re; + r[6*i+3]=(*sg).p[i].im-(*sf).p[i].im; + + r[6*i+4]=(*sg).pt[i].re-(*sf).pt[i].re; + r[6*i+5]=(*sg).pt[i].im-(*sf).pt[i].im; + + r[6*i+6]=(*sg).pd[i].re-(*sf).pd[i].re; + r[6*i+7]=(*sg).pd[i].im-(*sf).pd[i].im; + } + + dmax=0.0; + + for (i=0;i<20;i++) + { + dev=fabs(r[i]); + if (dev>dmax) + dmax=dev; + } + + return dmax; +} + + +static double dev_sf(void) +{ + int i; + double r[38],dev,dmax; + ch_drv2_t *sf1,*sf2; + + sf1=sf; + sf2=sf+1; + + r[0]=(*sf1).t-(*sf2).t; + r[1]=(*sf1).d-(*sf2).d; + + for (i=0;i<3;i++) + { + r[12*i+2]=(*sf1).p[i].re-(*sf2).p[i].re; + r[12*i+3]=(*sf1).p[i].im-(*sf2).p[i].im; + + r[12*i+4]=(*sf1).pt[i].re-(*sf2).pt[i].re; + r[12*i+5]=(*sf1).pt[i].im-(*sf2).pt[i].im; + + r[12*i+6]=(*sf1).pd[i].re-(*sf2).pd[i].re; + r[12*i+7]=(*sf1).pd[i].im-(*sf2).pd[i].im; + + r[12*i+8]=(*sf1).ptt[i].re-(*sf2).ptt[i].re; + r[12*i+9]=(*sf1).ptt[i].im-(*sf2).ptt[i].im; + + r[12*i+10]=(*sf1).ptd[i].re-(*sf2).ptd[i].re; + r[12*i+11]=(*sf1).ptd[i].im-(*sf2).ptd[i].im; + + r[12*i+12]=(*sf1).pdd[i].re-(*sf2).pdd[i].re; + r[12*i+13]=(*sf1).pdd[i].im-(*sf2).pdd[i].im; + } + + dmax=0.0; + + for (i=0;i<38;i++) + { + dev=fabs(r[i]); + if (dev>dmax) + dmax=dev; + } + + return dmax; +} + + +int main(void) +{ + int i; + double dev,dmax1,dmax2,dmax3; + + printf("\n"); + printf("Invariance of chexp_drv2() under rotations of X\n"); + printf("-----------------------------------------------\n\n"); + + printf("Test performed on %d random matrices X\n\n",NTEST); + + rlxd_init(1,SEED); + alloc_Xu(); + + dmax1=0.0; + dmax2=0.0; + dmax3=0.0; + + for (i=0;idmax1) + dmax1=dev; + + dev=dev_sg(); + if (dev>dmax2) + dmax2=dev; + + random_su3_dble(r); + rotate_su3alg(r,X); + chexp_drv2(X,sf+1); + + dev=dev_sf(); + if (dev>dmax3) + dmax3=dev; + } + + printf ("Comparision of chexp_drv0 and chexp_drv2 = %.1e\n",dmax1); + printf ("Comparision of chexp_drv1 and chexp_drv2 = %.1e\n",dmax2); + printf ("Rotation invariance of chexp_drv2 = %.1e\n\n",dmax3); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check5.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check5.c new file mode 100644 index 0000000000000000000000000000000000000000..18bc3cc19ee49dfca5c4cf8096c4c65c62124623 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check5.c @@ -0,0 +1,387 @@ + +/******************************************************************************* +* +* File check5.c +* +* Copyright (C) 2009, 2011 Filippo Palombi, Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of chexp_drv2() in the case of diagonal X +* +* This program verifies that eqs. (4.1)-(4.6) of the notes "SU(3) matrix +* functions" are satisfied by the coefficients obtained by chexp_drv2() +* +*******************************************************************************/ + +#include +#include +#include +#include "su3.h" +#include "utils.h" +#include "random.h" +#include "su3fcts.h" + +#define NTEST 100000 +#define SEED 8923 + +static double mu[3],xt[2],yt[2]; +static complex_dble t[2][3],x[3],y[3]; +static complex_dble xsq[3],stx[2][3],stt[2][2][3]; +static complex_dble ex[3],dex[2][3],ddex[2][2][3]; +static complex_dble df[2][3],ddf[2][2][3]; +static su3_alg_dble *X; +static ch_drv2_t *sf; + + +static void mul_vec(complex_dble *a,complex_dble *b,complex_dble *c) +{ + int i; + + for (i=0;i<3;i++) + { + c[i].re=a[i].re*b[i].re-a[i].im*b[i].im; + c[i].im=a[i].re*b[i].im+a[i].im*b[i].re; + } +} + + +static void add_vec(complex_dble z,complex_dble *a,complex_dble *b) +{ + int i; + + for (i=0;i<3;i++) + { + b[i].re+=(z.re*a[i].re-z.im*a[i].im); + b[i].im+=(z.re*a[i].im+z.im*a[i].re); + } +} + + +static void alloc_X(void) +{ + X=amalloc(1*sizeof(*X),4); + sf=amalloc(1*sizeof(*sf),4); + + error((X==NULL)||(sf==NULL),1, + "alloc_X [check5.c]","Unable to allocate matrices"); +} + + +static void set_tk(void) +{ + double r; + + t[0][0].re=0.0; + t[0][0].im=0.5; + t[0][1].re=0.0; + t[0][1].im=-0.5; + t[0][2].re=0.0; + t[0][2].im=0.0; + + r=1.0/(2.0*sqrt(3.0)); + + t[1][0].re=0.0; + t[1][0].im=r; + t[1][1].re=0.0; + t[1][1].im=r; + t[1][2].re=0.0; + t[1][2].im=-2.0*r; +} + + +static void random_X(void) +{ + double s; + + for (;;) + { + ranlxd(mu,2); + mu[0]=2.0*mu[0]-1.0; + mu[1]=2.0*mu[1]-1.0; + mu[2]=-mu[0]-mu[1]; + + if (fabs(mu[2])<=1.0) + break; + } + + (*X).c1=(mu[0]-mu[1])/3.0; + (*X).c2=(mu[0]-mu[2])/3.0; + (*X).c3=0.0; + (*X).c4=0.0; + (*X).c5=0.0; + (*X).c6=0.0; + (*X).c7=0.0; + (*X).c8=0.0; + + x[0].re=0.0; + x[0].im=mu[0]; + x[1].re=0.0; + x[1].im=mu[1]; + x[2].re=0.0; + x[2].im=mu[2]; + + xt[0]=x[0].im-x[1].im; + xt[1]=sqrt(3.0)*(x[0].im+x[1].im); + + s=(mu[0]*mu[0]+mu[1]*mu[1]+mu[2]*mu[2])/3.0; + + y[0].re=0.0; + y[0].im=mu[0]*mu[0]-s; + y[1].re=0.0; + y[1].im=mu[1]*mu[1]-s; + y[2].re=0.0; + y[2].im=mu[2]*mu[2]-s; + + yt[0]=y[0].im-y[1].im; + yt[1]=sqrt(3.0)*(y[0].im+y[1].im); + + ex[0].re=cos(mu[0]); + ex[0].im=sin(mu[0]); + ex[1].re=cos(mu[1]); + ex[1].im=sin(mu[1]); + ex[2].re=cos(mu[2]); + ex[2].im=sin(mu[2]); +} + + +static void diff_exp(void) +{ + int i,j; + + for (i=0;i<2;i++) + mul_vec(t[i],ex,dex[i]); + + for (i=0;i<2;i++) + { + for (j=0;j<2;j++) + mul_vec(t[i],dex[j],ddex[i][j]); + } +} + + +static void diff_fk(void) +{ + int i,j,k; + double d; + + for (i=0;i<2;i++) + { + for (k=0;k<3;k++) + { + df[i][k].re=0.5*(xt[i]*(*sf).pt[k].re+yt[i]*(*sf).pd[k].re); + df[i][k].im=0.5*(xt[i]*(*sf).pt[k].im+yt[i]*(*sf).pd[k].im); + } + } + + d=1.0/sqrt(3.0); + + for (k=0;k<3;k++) + { + ddf[0][0][k].re=0.5*((*sf).pt[k].re+d*xt[1]*(*sf).pd[k].re); + ddf[0][0][k].im=0.5*((*sf).pt[k].im+d*xt[1]*(*sf).pd[k].im); + + ddf[1][1][k].re=0.5*((*sf).pt[k].re-d*xt[1]*(*sf).pd[k].re); + ddf[1][1][k].im=0.5*((*sf).pt[k].im-d*xt[1]*(*sf).pd[k].im); + + ddf[0][1][k].re=0.5*d*xt[0]*(*sf).pd[k].re; + ddf[0][1][k].im=0.5*d*xt[0]*(*sf).pd[k].im; + + ddf[1][0][k].re=0.5*d*xt[0]*(*sf).pd[k].re; + ddf[1][0][k].im=0.5*d*xt[0]*(*sf).pd[k].im; + } + + for (i=0;i<2;i++) + { + for (j=0;j<2;j++) + { + for (k=0;k<3;k++) + { + ddf[i][j][k].re+=0.25*(xt[i]*xt[j]*(*sf).ptt[k].re+ + xt[i]*yt[j]*(*sf).ptd[k].re+ + yt[i]*xt[j]*(*sf).ptd[k].re+ + yt[i]*yt[j]*(*sf).pdd[k].re); + + ddf[i][j][k].im+=0.25*(xt[i]*xt[j]*(*sf).ptt[k].im+ + xt[i]*yt[j]*(*sf).ptd[k].im+ + yt[i]*xt[j]*(*sf).ptd[k].im+ + yt[i]*yt[j]*(*sf).pdd[k].im); + } + } + } +} + + +static void set_prods(void) +{ + int i,j; + + mul_vec(x,x,xsq); + + for (i=0;i<2;i++) + { + mul_vec(t[i],x,stx[i]); + + for (j=0;j<2;j++) + mul_vec(t[i],t[j],stt[i][j]); + } +} + + +static void subtract_chexp(void) +{ + int i,j,k; + complex_dble z; + + for (i=0;i<2;i++) + { + for (k=0;k<3;k++) + { + dex[i][k].re-=df[i][0].re; + dex[i][k].im-=df[i][0].im; + } + + z.re=-df[i][1].re; + z.im=-df[i][1].im; + add_vec(z,x,dex[i]); + + z.re=-df[i][2].re; + z.im=-df[i][2].im; + add_vec(z,xsq,dex[i]); + + z.re=-(*sf).p[1].re; + z.im=-(*sf).p[1].im; + add_vec(z,t[i],dex[i]); + + z.re=-2.0*(*sf).p[2].re; + z.im=-2.0*(*sf).p[2].im; + add_vec(z,stx[i],dex[i]); + + for (j=0;j<2;j++) + { + for (k=0;k<3;k++) + { + ddex[i][j][k].re-=ddf[i][j][0].re; + ddex[i][j][k].im-=ddf[i][j][0].im; + } + + z.re=-ddf[i][j][1].re; + z.im=-ddf[i][j][1].im; + add_vec(z,x,ddex[i][j]); + + z.re=-ddf[i][j][2].re; + z.im=-ddf[i][j][2].im; + add_vec(z,xsq,ddex[i][j]); + + z.re=-df[i][1].re; + z.im=-df[i][1].im; + add_vec(z,t[j],ddex[i][j]); + + z.re=-df[j][1].re; + z.im=-df[j][1].im; + add_vec(z,t[i],ddex[i][j]); + + z.re=-2.0*df[i][2].re; + z.im=-2.0*df[i][2].im; + add_vec(z,stx[j],ddex[i][j]); + + z.re=-2.0*df[j][2].re; + z.im=-2.0*df[j][2].im; + add_vec(z,stx[i],ddex[i][j]); + + z.re=-2.0*(*sf).p[2].re; + z.im=-2.0*(*sf).p[2].im; + add_vec(z,stt[i][j],ddex[i][j]); + } + } +} + + +static double dev_dex(void) +{ + int i,k; + double dev,dmax; + + dmax=0.0; + + for (i=0;i<2;i++) + { + for (k=0;k<3;k++) + { + dev=dex[i][k].re*dex[i][k].re+dex[i][k].im*dex[i][k].im; + if (dev>dmax) + dmax=dev; + } + } + + return sqrt(dmax); +} + + +static double dev_ddex(void) +{ + int i,j,k; + double dev,dmax; + + dmax=0.0; + + for (i=0;i<2;i++) + { + for (j=0;j<2;j++) + { + for (k=0;k<3;k++) + { + dev=ddex[i][j][k].re*ddex[i][j][k].re+ + ddex[i][j][k].im*ddex[i][j][k].im; + if (dev>dmax) + dmax=dev; + } + } + } + + return sqrt(dmax); +} + + +int main(void) +{ + int i; + double dev,dmax1,dmax2; + + printf("\n"); + printf("Check of chexp_drv2() for diagonal X\n"); + printf("------------------------------------\n\n"); + + printf("Test performed on %d random matrices X\n\n",NTEST); + + rlxd_init(1,SEED); + alloc_X(); + set_tk(); + + dmax1=0.0; + dmax2=0.0; + + for (i=0;idmax1) + dmax1=dev; + + dev=dev_ddex(); + if (dev>dmax2) + dmax2=dev; + } + + printf ("Maximal deviation of 1st derivatives = %.1e\n",dmax1); + printf ("Maximal deviation of 2nd derivatives = %.1e\n\n",dmax2); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check6.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check6.c new file mode 100644 index 0000000000000000000000000000000000000000..f9870d9cfb55bc80d5b94b1afc97f2746da40283 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check6.c @@ -0,0 +1,180 @@ + +/******************************************************************************* +* +* File check6.c +* +* Copyright (C) 2009, 2011 Filippo Palombi, Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Verifies that exp(X) is in SU(3) and that exp(X)*exp(-X)=1 +* +*******************************************************************************/ + +#include +#include +#include +#include "su3.h" +#include "utils.h" +#include "random.h" +#include "su3fcts.h" + +#define NTEST 100000 +#define SEED 1234 + +static double mu[3]; +static su3_alg_dble *X; +static su3_dble *r,*u,*v,*w; +static const su3_dble u0={{0.0}}; +static ch_drv0_t *sp; + + +static void alloc_Xu(void) +{ + X=amalloc(1*sizeof(*X),4); + r=amalloc(4*sizeof(*r),4); + sp=amalloc(1*sizeof(*sp),4); + + error((X==NULL)||(r==NULL)||(sp==NULL),1, + "alloc_Xu [check6.c]","Unable to allocate matrices"); + + u=r+1; + v=r+2; + w=r+3; +} + + +static void random_Xu(void) +{ + for (;;) + { + ranlxd(mu,2); + mu[0]=2.0*mu[0]-1.0; + mu[1]=2.0*mu[1]-1.0; + mu[2]=-mu[0]-mu[1]; + + if (fabs(mu[2])<=1.0) + break; + } + + (*u)=u0; + (*u).c11.im=mu[0]; + (*u).c22.im=mu[1]; + (*u).c33.im=mu[2]; + + random_su3_dble(r); + su3xsu3(r,u,w); + su3xsu3dag(w,r,u); + + (*X).c1=((*u).c11.im-(*u).c22.im)/3.0; + (*X).c2=((*u).c11.im-(*u).c33.im)/3.0; + (*X).c3=(*u).c12.re; + (*X).c4=(*u).c12.im; + (*X).c5=(*u).c13.re; + (*X).c6=(*u).c13.im; + (*X).c7=(*u).c23.re; + (*X).c8=(*u).c23.im; +} + + +static void flip_signX(void) +{ + (*X).c1=-(*X).c1; + (*X).c2=-(*X).c2; + (*X).c3=-(*X).c3; + (*X).c4=-(*X).c4; + (*X).c5=-(*X).c5; + (*X).c6=-(*X).c6; + (*X).c7=-(*X).c7; + (*X).c8=-(*X).c8; +} + + +static double dev_uv(void) +{ + int i; + double r[18],dev,dmax; + + r[ 0]=(*u).c11.re-(*v).c11.re; + r[ 1]=(*u).c11.im-(*v).c11.im; + r[ 2]=(*u).c12.re-(*v).c12.re; + r[ 3]=(*u).c12.im-(*v).c12.im; + r[ 4]=(*u).c13.re-(*v).c13.re; + r[ 5]=(*u).c13.im-(*v).c13.im; + + r[ 6]=(*u).c21.re-(*v).c21.re; + r[ 7]=(*u).c21.im-(*v).c21.im; + r[ 8]=(*u).c22.re-(*v).c22.re; + r[ 9]=(*u).c22.im-(*v).c22.im; + r[10]=(*u).c23.re-(*v).c23.re; + r[11]=(*u).c23.im-(*v).c23.im; + + r[12]=(*u).c31.re-(*v).c31.re; + r[13]=(*u).c31.im-(*v).c31.im; + r[14]=(*u).c32.re-(*v).c32.re; + r[15]=(*u).c32.im-(*v).c32.im; + r[16]=(*u).c33.re-(*v).c33.re; + r[17]=(*u).c33.im-(*v).c33.im; + + dmax=0.0; + + for (i=0;i<18;i++) + { + dev=fabs(r[i]); + if (dev>dmax) + dmax=dev; + } + + return dmax; +} + + +int main(void) +{ + int i; + double dev,dmax1,dmax2; + + printf("\n"); + printf("Simple checks of exp(X) as calculated by chexp_drv0()\n"); + printf("-----------------------------------------------------\n\n"); + + printf("Test performed on %d random matrices X\n",NTEST); + + rlxd_init(1,SEED); + alloc_Xu(); + + dmax1=0.0; + dmax2=0.0; + + for (i=0;idmax1) + dmax1=dev; + + flip_signX(); + chexp_drv0(X,sp); + ch2mat((*sp).p,X,w); + su3xsu3(u,w,v); + (*u)=u0; + (*u).c11.re=1.0; + (*u).c22.re=1.0; + (*u).c33.re=1.0; + + dev=dev_uv(); + if (dev>dmax2) + dmax2=dev; + } + + printf ("Maximal deviation of exp(X) from SU(3) = %.1e\n",dmax1); + printf ("Maximal deviation of exp(X)*exp(-X) from 1 = %.1e\n\n",dmax2); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check7.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check7.c new file mode 100644 index 0000000000000000000000000000000000000000..afff5a3a1285506e3e2410e3a13984bb45ad577a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check7.c @@ -0,0 +1,170 @@ + +/******************************************************************************* +* +* File check7.c +* +* Copyright (C) 2009, 2011 Martin Luescher, Filippo Palombi +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of expXsu3() using the spectral representation of X +* +*******************************************************************************/ + +#include +#include +#include +#include "su3.h" +#include "utils.h" +#include "random.h" +#include "su3fcts.h" + +#define NTEST 50000 +#define SEED 38579 + +static double mu[3],t,d,eps; +static su3_alg_dble *X; +static su3_dble *r,*u,*w,*y,*z; +static const su3_dble u0={{0.0}}; + + +static void alloc_Xu(void) +{ + X=amalloc(1*sizeof(*X),4); + r=amalloc(5*sizeof(*r),4); + + error((X==NULL)||(r==NULL),1, + "alloc_Xu [check7.c]","Unable to allocate matrices"); + + u=r+1; + w=r+2; + y=r+3; + z=r+4; +} + + +static void random_Xu(void) +{ + for (;;) + { + ranlxd(mu,2); + mu[0]=2.0*mu[0]-1.0; + mu[1]=2.0*mu[1]-1.0; + mu[2]=-mu[0]-mu[1]; + + if (fabs(mu[2])<=1.0) + break; + } + + t=0.5*(mu[0]*mu[0]+mu[1]*mu[1]+mu[2]*mu[2]); + d=mu[0]*mu[1]*mu[2]; + + (*u)=u0; + (*u).c11.im=mu[0]; + (*u).c22.im=mu[1]; + (*u).c33.im=mu[2]; + + random_su3_dble(r); + su3xsu3(r,u,w); + su3xsu3dag(w,r,u); + + (*X).c1=((*u).c11.im-(*u).c22.im)/3.0; + (*X).c2=((*u).c11.im-(*u).c33.im)/3.0; + (*X).c3=(*u).c12.re; + (*X).c4=(*u).c12.im; + (*X).c5=(*u).c13.re; + (*X).c6=(*u).c13.im; + (*X).c7=(*u).c23.re; + (*X).c8=(*u).c23.im; + + ranlxd(&eps,1); + eps*=20.0; + + (*u)=u0; + (*u).c11.re=cos(eps*mu[0]); + (*u).c22.re=cos(eps*mu[1]); + (*u).c33.re=cos(eps*mu[2]); + (*u).c11.im=sin(eps*mu[0]); + (*u).c22.im=sin(eps*mu[1]); + (*u).c33.im=sin(eps*mu[2]); + + su3xsu3(r,u,w); + su3xsu3dag(w,r,u); + + random_su3_dble(z); + su3xsu3(u,z,w); +} + + +static double dev_yw(void) +{ + int i; + double r[18],dev,dmax; + + r[ 0]=(*y).c11.re-(*w).c11.re; + r[ 1]=(*y).c11.im-(*w).c11.im; + r[ 2]=(*y).c12.re-(*w).c12.re; + r[ 3]=(*y).c12.im-(*w).c12.im; + r[ 4]=(*y).c13.re-(*w).c13.re; + r[ 5]=(*y).c13.im-(*w).c13.im; + + r[ 6]=(*y).c21.re-(*w).c21.re; + r[ 7]=(*y).c21.im-(*w).c21.im; + r[ 8]=(*y).c22.re-(*w).c22.re; + r[ 9]=(*y).c22.im-(*w).c22.im; + r[10]=(*y).c23.re-(*w).c23.re; + r[11]=(*y).c23.im-(*w).c23.im; + + r[12]=(*y).c31.re-(*w).c31.re; + r[13]=(*y).c31.im-(*w).c31.im; + r[14]=(*y).c32.re-(*w).c32.re; + r[15]=(*y).c32.im-(*w).c32.im; + r[16]=(*y).c33.re-(*w).c33.re; + r[17]=(*y).c33.im-(*w).c33.im; + + dmax=0.0; + + for (i=0;i<18;i++) + { + dev=fabs(r[i]); + if (dev>dmax) + dmax=dev; + } + + return dmax; +} + + +int main(void) +{ + int i; + double dev,dmax; + + printf("\n"); + printf("Check of expXsu3()\n"); + printf("------------------\n\n"); + + printf("Test performed on %d random matrices X and u using the\n",NTEST); + printf("spectral representation of X\n\n"); + + rlxd_init(1,SEED); + alloc_Xu(); + + dmax=0.0; + + for (i=0;idmax) + dmax=dev; + } + + printf ("Maximal deviation of exp(X)*u = %.1e\n\n",dmax); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check8.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check8.c new file mode 100644 index 0000000000000000000000000000000000000000..11b3e64428a4cb065e3f18fd526d6cf6c04d7d12 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check8.c @@ -0,0 +1,558 @@ + +/******************************************************************************* +* +* File check8.c +* +* Copyright (C) 2009, 2010, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the programs cm3x3_zero(),...,cm3x3_lc2() +* +*******************************************************************************/ + +#include +#include +#include +#include "su3.h" +#include "utils.h" +#include "random.h" +#include "su3fcts.h" + +#define NTEST 1000 +#define SEED 376 + +static double rs,*rus,*rvs,*rws,*rzs; +static complex_dble *cs; +static su3_dble *us,*vs,*ws,*zs; +static su3_dble ud0={{0.0}}; +static complex_dble trs ALIGNED16; + + +static void alloc_matrices(void) +{ + rus=amalloc(90*sizeof(*rus),4); + cs=amalloc(3*sizeof(*cs),4); + us=amalloc(5*sizeof(*us),4); + + error((rus==NULL)||(cs==NULL)||(us==NULL),1, + "alloc_matrices [check8.c]","Unable to allocate matrices"); + + rvs=rus+36; + rws=rvs+18; + rzs=rws+18; + + vs=us+2; + ws=vs+1; + zs=ws+1; +} + + +static void mat2vec(su3_dble *u,double *ru) +{ + ru[ 0]=(*u).c11.re; + ru[ 1]=(*u).c11.im; + ru[ 2]=(*u).c12.re; + ru[ 3]=(*u).c12.im; + ru[ 4]=(*u).c13.re; + ru[ 5]=(*u).c13.im; + + ru[ 6]=(*u).c21.re; + ru[ 7]=(*u).c21.im; + ru[ 8]=(*u).c22.re; + ru[ 9]=(*u).c22.im; + ru[10]=(*u).c23.re; + ru[11]=(*u).c23.im; + + ru[12]=(*u).c31.re; + ru[13]=(*u).c31.im; + ru[14]=(*u).c32.re; + ru[15]=(*u).c32.im; + ru[16]=(*u).c33.re; + ru[17]=(*u).c33.im; +} + + +static void vec2mat(double *ru,su3_dble *u) +{ + (*u).c11.re=ru[ 0]; + (*u).c11.im=ru[ 1]; + (*u).c12.re=ru[ 2]; + (*u).c12.im=ru[ 3]; + (*u).c13.re=ru[ 4]; + (*u).c13.im=ru[ 5]; + + (*u).c21.re=ru[ 6]; + (*u).c21.im=ru[ 7]; + (*u).c22.re=ru[ 8]; + (*u).c22.im=ru[ 9]; + (*u).c23.re=ru[10]; + (*u).c23.im=ru[11]; + + (*u).c31.re=ru[12]; + (*u).c31.im=ru[13]; + (*u).c32.re=ru[14]; + (*u).c32.im=ru[15]; + (*u).c33.re=ru[16]; + (*u).c33.im=ru[17]; +} + + +static void add_vec(double *ru,double *rv,double *rw) +{ + int i; + + for (i=0;i<18;i++) + rw[i]=ru[i]+rv[i]; +} + + +static void mulr_vec(double r,double *ru,double *rv) +{ + int i; + + for (i=0;i<18;i++) + rv[i]=r*ru[i]; +} + + +static void mulc_vec(complex_dble c,double *ru,double *rv) +{ + int i; + + for (i=0;i<18;i+=2) + { + rv[i ]=c.re*ru[i ]-c.im*ru[i+1]; + rv[i+1]=c.re*ru[i+1]+c.im*ru[i ]; + } +} + + +static void dag_vec(double *ru,double *rv) +{ + int i,j; + + for (i=0;i<3;i++) + { + for (j=0;j<3;j++) + { + rv[6*i+2*j ]= ru[6*j+2*i ]; + rv[6*i+2*j+1]=-ru[6*j+2*i+1]; + } + } +} + + +static void random_matrix(su3_dble *u) +{ + int i; + double r[18]; + + ranlxd(r,18); + + for (i=0;i<18;i++) + r[i]=2.0*r[i]-1.0; + + vec2mat(r,u); +} + + +static void start_test(void) +{ + int i; + double r[6]; + + ranlxd(&rs,1); + rs=2.0*rs-1.0; + + ranlxd(r,6); + + for (i=0;i<6;i++) + r[i]=2.0*r[i]-1.0; + + cs[0].re=r[0]; + cs[0].im=r[1]; + cs[1].re=r[2]; + cs[1].im=r[3]; + cs[2].re=r[4]; + cs[2].im=r[5]; + + random_matrix(us); + random_matrix(us+1); + random_matrix(vs); + random_matrix(ws); + random_matrix(zs); +} + + +static double dev_uv(su3_dble *u,su3_dble *v) +{ + int i; + double r[18],dev,dmax; + + r[ 0]=(*u).c11.re-(*v).c11.re; + r[ 1]=(*u).c11.im-(*v).c11.im; + r[ 2]=(*u).c12.re-(*v).c12.re; + r[ 3]=(*u).c12.im-(*v).c12.im; + r[ 4]=(*u).c13.re-(*v).c13.re; + r[ 5]=(*u).c13.im-(*v).c13.im; + + r[ 6]=(*u).c21.re-(*v).c21.re; + r[ 7]=(*u).c21.im-(*v).c21.im; + r[ 8]=(*u).c22.re-(*v).c22.re; + r[ 9]=(*u).c22.im-(*v).c22.im; + r[10]=(*u).c23.re-(*v).c23.re; + r[11]=(*u).c23.im-(*v).c23.im; + + r[12]=(*u).c31.re-(*v).c31.re; + r[13]=(*u).c31.im-(*v).c31.im; + r[14]=(*u).c32.re-(*v).c32.re; + r[15]=(*u).c32.im-(*v).c32.im; + r[16]=(*u).c33.re-(*v).c33.re; + r[17]=(*u).c33.im-(*v).c33.im; + + dmax=0.0; + + for (i=0;i<18;i++) + { + dev=fabs(r[i]); + if (dev>dmax) + dmax=dev; + } + + return dmax; +} + + +int main(void) +{ + int i; + double dev,dmax[15]; + + printf("\n"); + printf("Check of the programs cm3x3_zero(),...,cm3x3_lc2()\n"); + printf("--------------------------------------------------\n\n"); + + printf("Test performed on %d random matrices\n\n",NTEST); + + rlxd_init(1,SEED); + alloc_matrices(); + + for (i=0;i<15;i++) + dmax[i]=0.0; + + for (i=0;idmax[0]) + dmax[0]=dev; + + cm3x3_unity(1,vs); + (*vs).c11.re-=1.0; + (*vs).c22.re-=1.0; + (*vs).c33.re-=1.0; + dev=dev_uv(&ud0,vs); + vec2mat(rvs,vs); + if (dev>dmax[1]) + dmax[1]=dev; + + mat2vec(us,rus); + mat2vec(ws,rws); + vec2mat(rus,ws); + cm3x3_assign(1,us,vs); + dev=dev_uv(ws,vs); + if (dev>dmax[2]) + dmax[2]=dev; + dev=dev_uv(ws,us); + if (dev>dmax[2]) + dmax[2]=dev; + vec2mat(rvs,vs); + vec2mat(rws,ws); + + mat2vec(us,rus); + mat2vec(vs,rvs); + mat2vec(ws,rws); + cm3x3_swap(1,us,vs); + vec2mat(rus,ws); + dev=dev_uv(ws,vs); + if (dev>dmax[14]) + dmax[14]=dev; + vec2mat(rvs,ws); + dev=dev_uv(ws,us); + if (dev>dmax[14]) + dmax[14]=dev; + vec2mat(rus,us); + vec2mat(rvs,vs); + vec2mat(rws,ws); + + mat2vec(us,rus); + dag_vec(rus,rzs); + vec2mat(rzs,ws); + cm3x3_dagger(us,vs); + dev=dev_uv(vs,ws); + if (dev>dmax[3]) + dmax[3]=dev; + vec2mat(rus,ws); + dev=dev_uv(us,ws); + if (dev>dmax[3]) + dmax[3]=dev; + + vec2mat(rzs,ws); + cm3x3_dagger(us,us); + dev=dev_uv(us,ws); + if (dev>dmax[3]) + dmax[3]=dev; + vec2mat(rus,us); + vec2mat(rvs,vs); + vec2mat(rws,ws); + + cm3x3_tr(us,us+1,&trs); + su3xsu3(us,us+1,vs); + trs.re-=((*vs).c11.re+(*vs).c22.re+(*vs).c33.re); + trs.im-=((*vs).c11.im+(*vs).c22.im+(*vs).c33.im); + dev=fabs(trs.re)+fabs(trs.im); + vec2mat(rvs,vs); + if (dev>dmax[4]) + dmax[4]=dev; + + cm3x3_tr(us,us,&trs); + su3xsu3(us,us,vs); + trs.re-=((*vs).c11.re+(*vs).c22.re+(*vs).c33.re); + trs.im-=((*vs).c11.im+(*vs).c22.im+(*vs).c33.im); + dev=fabs(trs.re)+fabs(trs.im); + vec2mat(rvs,vs); + if (dev>dmax[4]) + dmax[4]=dev; + + cm3x3_retr(us,us+1,&trs.re); + su3xsu3(us,us+1,vs); + trs.re-=((*vs).c11.re+(*vs).c22.re+(*vs).c33.re); + dev=fabs(trs.re); + vec2mat(rvs,vs); + if (dev>dmax[5]) + dmax[5]=dev; + + cm3x3_retr(us,us,&trs.re); + su3xsu3(us,us,vs); + trs.re-=((*vs).c11.re+(*vs).c22.re+(*vs).c33.re); + dev=fabs(trs.re); + vec2mat(rvs,vs); + if (dev>dmax[5]) + dmax[5]=dev; + + mat2vec(us,rus); + mat2vec(vs,rvs); + mat2vec(zs,rzs); + add_vec(rus,rvs,rws); + vec2mat(rws,zs); + cm3x3_add(us,vs); + dev=dev_uv(vs,zs); + vec2mat(rvs,vs); + vec2mat(rzs,zs); + if (dev>dmax[6]) + dmax[6]=dev; + + mat2vec(us,rvs); + add_vec(rus,rvs,rws); + vec2mat(rws,zs); + cm3x3_add(us,us); + dev=dev_uv(us,zs); + vec2mat(rus,us); + vec2mat(rzs,zs); + if (dev>dmax[6]) + dmax[6]=dev; + + mat2vec(ws,rws); + mat2vec(zs,rzs); + su3xsu3(us,vs,zs); + mat2vec(zs,rvs); + add_vec(rvs,rws,rus); + vec2mat(rus,zs); + cm3x3_mul_add(us,vs,ws); + vec2mat(rws,ws); + cm3x3_mul_add(us,vs,ws); + dev=dev_uv(ws,zs); + vec2mat(rws,ws); + vec2mat(rzs,zs); + if (dev>dmax[7]) + dmax[7]=dev; + + mat2vec(vs,rvs); + mat2vec(zs,rzs); + su3xsu3(us,vs,zs); + mat2vec(zs,rws); + add_vec(rws,rvs,rus); + vec2mat(rus,zs); + cm3x3_mul_add(us,vs,vs); + vec2mat(rvs,vs); + cm3x3_mul_add(us,vs,vs); + dev=dev_uv(vs,zs); + vec2mat(rvs,vs); + vec2mat(rzs,zs); + if (dev>dmax[7]) + dmax[7]=dev; + + mat2vec(ws,rws); + mat2vec(zs,rzs); + mat2vec(us,rus); + mulr_vec(rs,rus,rvs); + vec2mat(rvs,zs); + cm3x3_mulr(&rs,us,ws); + dev=dev_uv(ws,zs); + if (dev>dmax[8]) + dmax[8]=dev; + + cm3x3_mulr(&rs,us,us); + dev=dev_uv(us,zs); + vec2mat(rus,us); + vec2mat(rws,ws); + vec2mat(rzs,zs); + if (dev>dmax[8]) + dmax[8]=dev; + + mat2vec(us,rus); + mat2vec(vs,rvs); + mulr_vec(rs,rus,rws); + add_vec(rws,rvs,rzs); + mat2vec(ws,rws); + vec2mat(rzs,ws); + cm3x3_mulr_add(&rs,us,vs); + dev=dev_uv(vs,ws); + vec2mat(rvs,vs); + vec2mat(rws,ws); + if (dev>dmax[9]) + dmax[9]=dev; + + mulr_vec(rs,rus,rws); + add_vec(rus,rws,rvs); + mat2vec(ws,rws); + vec2mat(rvs,ws); + cm3x3_mulr_add(&rs,us,us); + dev=dev_uv(us,ws); + vec2mat(rus,us); + vec2mat(rws,ws); + if (dev>dmax[9]) + dmax[9]=dev; + + mat2vec(ws,rws); + mat2vec(zs,rzs); + mat2vec(us,rus); + mulc_vec(cs[0],rus,rvs); + vec2mat(rvs,zs); + cm3x3_mulc(cs,us,ws); + dev=dev_uv(ws,zs); + if (dev>dmax[10]) + dmax[10]=dev; + + cm3x3_mulc(cs,us,us); + dev=dev_uv(us,zs); + vec2mat(rus,us); + vec2mat(rws,ws); + vec2mat(rzs,zs); + if (dev>dmax[10]) + dmax[10]=dev; + + mat2vec(us,rus); + mat2vec(vs,rvs); + mulc_vec(cs[0],rus,rws); + add_vec(rws,rvs,rzs); + mat2vec(ws,rws); + vec2mat(rzs,ws); + cm3x3_mulc_add(cs,us,vs); + dev=dev_uv(vs,ws); + vec2mat(rvs,vs); + vec2mat(rws,ws); + if (dev>dmax[11]) + dmax[11]=dev; + + mulc_vec(cs[0],rus,rws); + add_vec(rus,rws,rvs); + mat2vec(ws,rws); + vec2mat(rvs,ws); + cm3x3_mulc_add(cs,us,us); + dev=dev_uv(us,ws); + vec2mat(rus,us); + vec2mat(rws,ws); + if (dev>dmax[11]) + dmax[11]=dev; + + mat2vec(us,rus); + mat2vec(vs,rvs); + mat2vec(ws,rws); + mulc_vec(cs[1],rus,rzs); + vec2mat(rzs,ws); + (*ws).c11.re+=cs[0].re; + (*ws).c11.im+=cs[0].im; + (*ws).c22.re+=cs[0].re; + (*ws).c22.im+=cs[0].im; + (*ws).c33.re+=cs[0].re; + (*ws).c33.im+=cs[0].im; + cm3x3_lc1(cs,us,vs); + dev=dev_uv(vs,ws); + if (dev>dmax[12]) + dmax[12]=dev; + + cm3x3_lc1(cs,us,us); + dev=dev_uv(us,ws); + vec2mat(rus,us); + vec2mat(rvs,vs); + vec2mat(rws,ws); + if (dev>dmax[12]) + dmax[12]=dev; + + mat2vec(us,rus); + mat2vec(us+1,rus+18); + mulc_vec(cs[1],rus,rvs); + mulc_vec(cs[2],rus+18,rws); + add_vec(rvs,rws,rzs); + vec2mat(rzs,ws); + (*ws).c11.re+=cs[0].re; + (*ws).c11.im+=cs[0].im; + (*ws).c22.re+=cs[0].re; + (*ws).c22.im+=cs[0].im; + (*ws).c33.re+=cs[0].re; + (*ws).c33.im+=cs[0].im; + cm3x3_lc2(cs,us,vs); + dev=dev_uv(vs,ws); + if (dev>dmax[13]) + dmax[13]=dev; + + cm3x3_lc2(cs,us,us); + vec2mat(rus,us); + cm3x3_lc2(cs,us,us); + dev=dev_uv(us,ws); + if (dev>dmax[13]) + dmax[13]=dev; + } + + printf("Maximal deviation of cm3x3_zero() = %.1e\n",dmax[0]); + printf("Maximal deviation of cm3x3_unity() = %.1e\n",dmax[1]); + printf("Maximal deviation of cm3x3_assign() = %.1e\n",dmax[2]); + printf("Maximal deviation of cm3x3_swap() = %.1e\n",dmax[14]); + printf("Maximal deviation of cm3x3_dagger() = %.1e\n",dmax[3]); + printf("Maximal deviation of cm3x3_tr() = %.1e\n",dmax[4]); + printf("Maximal deviation of cm3x3_retr() = %.1e\n",dmax[5]); + printf("Maximal deviation of cm3x3_add() = %.1e\n",dmax[6]); + printf("Maximal deviation of cm3x3_mul_add() = %.1e\n",dmax[7]); + printf("Maximal deviation of cm3x3_mulr() = %.1e\n",dmax[8]); + printf("Maximal deviation of cm3x3_mulr_add() = %.1e\n",dmax[9]); + printf("Maximal deviation of cm3x3_mulc() = %.1e\n",dmax[10]); + printf("Maximal deviation of cm3x3_mulc_add() = %.1e\n",dmax[11]); + printf("Maximal deviation of cm3x3_lc1() = %.1e\n",dmax[12]); + printf("Maximal deviation of cm3x3_lc2() = %.1e\n\n",dmax[13]); + + for (i=1;i<15;i++) + { + if (dmax[i]>dmax[0]) + dmax[0]=dmax[i]; + } + + printf("Maximal deviation (all tests) = %.1e\n\n",dmax[0]); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/time1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/time1.c new file mode 100644 index 0000000000000000000000000000000000000000..ef1ee2e14f8819d254752766eaf0952441064234 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/time1.c @@ -0,0 +1,385 @@ + +/******************************************************************************* +* +* File time1.c +* +* Copyright (C) 2005, 2008, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Timing of the SU(3) x SU(3)-vector multiplication (single-precision programs) +* +*******************************************************************************/ + +#include +#include +#include +#include +#include "su3.h" +#include "random.h" +#include "su3fcts.h" + +static su3 u[4] ALIGNED16; +static su3_vector s[8],r[8],t[8] ALIGNED16; + +#if (defined x64) +#if (defined AVX) +#include "avx.h" + +#define _avx_vector_quartet_load(s) \ +__asm__ __volatile__ ("vmovaps %0, %%xmm6 \n\t" \ + "vmovaps %2, %%xmm7 \n\t" \ + "vmovaps %4, %%xmm8" \ + : \ + : \ + "m" ((s[0]).c1), \ + "m" ((s[0]).c2), \ + "m" ((s[0]).c3), \ + "m" ((s[1]).c1), \ + "m" ((s[1]).c2), \ + "m" ((s[1]).c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vinsertf128 $0x1, %0, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %2, %%ymm7, %%ymm7 \n\t" \ + "vinsertf128 $0x1, %4, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((s[2]).c1), \ + "m" ((s[2]).c2), \ + "m" ((s[2]).c3), \ + "m" ((s[3]).c1), \ + "m" ((s[3]).c2), \ + "m" ((s[3]).c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vshufps $0xe4, %%ymm7, %%ymm6, %%ymm0 \n\t" \ + "vshufps $0x4e, %%ymm8, %%ymm6, %%ymm1 \n\t" \ + "vshufps $0xe4, %%ymm8, %%ymm7, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +#define _avx_vector_quartet_store_up(r) \ +__asm__ __volatile__ ("vshufps $0x44, %%ymm4, %%ymm3, %%ymm9 \n\t" \ + "vshufps $0xe4, %%ymm3, %%ymm5, %%ymm10 \n\t" \ + "vshufps $0xee, %%ymm5, %%ymm4, %%ymm11" \ + : \ + : \ + : \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vmovaps %%xmm9, %0 \n\t" \ + "vmovaps %%xmm10, %2 \n\t" \ + "vmovaps %%xmm11, %4" \ + : \ + : \ + "m" ((r[0]).c1), \ + "m" ((r[0]).c2), \ + "m" ((r[0]).c3), \ + "m" ((r[1]).c1), \ + "m" ((r[1]).c2), \ + "m" ((r[1]).c3)); \ +__asm__ __volatile__ ("vextractf128 $0x1, %%ymm9, %0 \n\t" \ + "vextractf128 $0x1, %%ymm10, %2 \n\t" \ + "vextractf128 $0x1, %%ymm11, %4" \ + : \ + : \ + "m" ((r[2]).c1), \ + "m" ((r[2]).c2), \ + "m" ((r[2]).c3), \ + "m" ((r[3]).c1), \ + "m" ((r[3]).c2), \ + "m" ((r[3]).c3)) + + +static void fast_multiply(su3 *ua,su3_vector *sa,su3_vector *ra) +{ + _avx_vector_quartet_load(sa); + _avx_su3_pair_multiply(ua[0],ua[1]); + _avx_vector_quartet_store_up(ra); + ua+=2; + sa+=4; + ra+=4; + _avx_vector_quartet_load(sa); + _avx_su3_pair_multiply(ua[0],ua[1]); + _avx_vector_quartet_store_up(ra); +} + + +static void fast_inverse_multiply(su3 *ua,su3_vector *sa,su3_vector *ra) +{ + _avx_vector_quartet_load(sa); + _avx_su3_pair_inverse_multiply(ua[0],ua[1]); + _avx_vector_quartet_store_up(ra); + ua+=2; + sa+=4; + ra+=4; + _avx_vector_quartet_load(sa); + _avx_su3_pair_inverse_multiply(ua[0],ua[1]); + _avx_vector_quartet_store_up(ra); +} + + +static void fast_mixed_multiply(su3 *ua,su3_vector *sa,su3_vector *ra) +{ + _avx_vector_quartet_load(sa); + _avx_su3_pair_mixed_multiply(ua[0],ua[1]); + _avx_vector_quartet_store_up(ra); + ua+=2; + sa+=4; + ra+=4; + _avx_vector_quartet_load(sa); + _avx_su3_pair_mixed_multiply(ua[0],ua[1]); + _avx_vector_quartet_store_up(ra); +} + + +static void slow_mixed_multiply(su3 *ua,su3_vector *sa,su3_vector *ra) +{ + _su3_multiply((*(ra )),(*(ua )),(*(sa ))); + _su3_multiply((*(ra+1)),(*(ua )),(*(sa+1))); + _su3_inverse_multiply((*(ra+2)),(*(ua+1)),(*(sa+2))); + _su3_inverse_multiply((*(ra+3)),(*(ua+1)),(*(sa+3))); + _su3_multiply((*(ra+4)),(*(ua+2)),(*(sa+4))); + _su3_multiply((*(ra+5)),(*(ua+2)),(*(sa+5))); + _su3_inverse_multiply((*(ra+6)),(*(ua+3)),(*(sa+6))); + _su3_inverse_multiply((*(ra+7)),(*(ua+3)),(*(sa+7))); +} + +#else +#include "sse2.h" + +#define _su3_fast_multiply(r1,r2,u,s1,s2) \ + _sse_pair_load(s1,s2); \ + _sse_su3_multiply(u); \ + _sse_pair_store_up(r1,r2) + +#define _su3_fast_inverse_multiply(r1,r2,u,s1,s2) \ + _sse_pair_load(s1,s2); \ + _sse_su3_inverse_multiply(u); \ + _sse_pair_store_up(r1,r2) + + +static void fast_multiply(su3 *ua,su3_vector *sa,su3_vector *ra) +{ + _su3_fast_multiply((*(ra )),(*(ra+1)),(*(ua )),(*(sa )),(*(sa+1))); + _su3_fast_multiply((*(ra+2)),(*(ra+3)),(*(ua+1)),(*(sa+2)),(*(sa+3))); + _su3_fast_multiply((*(ra+4)),(*(ra+5)),(*(ua+2)),(*(sa+4)),(*(sa+5))); + _su3_fast_multiply((*(ra+6)),(*(ra+7)),(*(ua+3)),(*(sa+6)),(*(sa+7))); +} + + +static void fast_inverse_multiply(su3 *ua,su3_vector *sa,su3_vector *ra) +{ + _su3_fast_inverse_multiply((*(ra )),(*(ra+1)),(*(ua )),(*(sa )),(*(sa+1))); + _su3_fast_inverse_multiply((*(ra+2)),(*(ra+3)),(*(ua+1)),(*(sa+2)),(*(sa+3))); + _su3_fast_inverse_multiply((*(ra+4)),(*(ra+5)),(*(ua+2)),(*(sa+4)),(*(sa+5))); + _su3_fast_inverse_multiply((*(ra+6)),(*(ra+7)),(*(ua+3)),(*(sa+6)),(*(sa+7))); +} + +#endif + +static void slow_inverse_multiply(su3 *ua,su3_vector *sa,su3_vector *ra) +{ + _su3_inverse_multiply((*(ra )),(*(ua )),(*(sa ))); + _su3_inverse_multiply((*(ra+1)),(*(ua )),(*(sa+1))); + _su3_inverse_multiply((*(ra+2)),(*(ua+1)),(*(sa+2))); + _su3_inverse_multiply((*(ra+3)),(*(ua+1)),(*(sa+3))); + _su3_inverse_multiply((*(ra+4)),(*(ua+2)),(*(sa+4))); + _su3_inverse_multiply((*(ra+5)),(*(ua+2)),(*(sa+5))); + _su3_inverse_multiply((*(ra+6)),(*(ua+3)),(*(sa+6))); + _su3_inverse_multiply((*(ra+7)),(*(ua+3)),(*(sa+7))); +} + +#endif + +static void slow_multiply(su3 *ua,su3_vector *sa,su3_vector *ra) +{ + _su3_multiply((*(ra )),(*(ua )),(*(sa ))); + _su3_multiply((*(ra+1)),(*(ua )),(*(sa+1))); + _su3_multiply((*(ra+2)),(*(ua+1)),(*(sa+2))); + _su3_multiply((*(ra+3)),(*(ua+1)),(*(sa+3))); + _su3_multiply((*(ra+4)),(*(ua+2)),(*(sa+4))); + _su3_multiply((*(ra+5)),(*(ua+2)),(*(sa+5))); + _su3_multiply((*(ra+6)),(*(ua+3)),(*(sa+6))); + _su3_multiply((*(ra+7)),(*(ua+3)),(*(sa+7))); +} + + +int main(void) +{ + int k,n,count; + double t1,t2,dt; +#if (defined x64) + double delta,diff,norm; +#endif + + printf("\n"); + printf("Time per single-precision SU(3) x SU(3)-vector multiplication\n"); + printf("-------------------------------------------------------------\n\n"); + +#if (defined AVX) + printf("Using AVX instructions\n"); +#elif (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n"); +#endif + + printf("Measurement made with all data in cache\n\n"); + + rlxs_init(0,123456); + + for (k=0;k<4;k++) + random_su3(u+k); + + gauss((float*)(s),48); + gauss((float*)(r),48); + gauss((float*)(t),48); + +#if (defined x64) + + n=(int)(1.0e6); + dt=0.0; + + while (dt<2.0) + { + t1=(double)clock(); + for (count=0;countdelta) + delta=diff; + } + +#if (defined AVX) + printf("||U*w_AVX-U*w_FPU||<= %.1e*||w||\n",delta); +#else + printf("||U*w_SSE-U*w_FPU||<= %.1e*||w||\n",delta); +#endif + + fast_inverse_multiply(u,s,r); + slow_inverse_multiply(u,s,t); + delta=0.0; + + for (k=0;k<8;k++) + { + _vector_sub_assign(r[k],t[k]); + diff=(double)(_vector_prod_re(r[k],r[k])); + norm=(double)(_vector_prod_re(s[k],s[k])); + diff=sqrt(diff/norm); + if (diff>delta) + delta=diff; + } + +#if (defined AVX) + printf("||U^dag*w_AVX-U^dag*w_FPU||<= %.1e*||w||\n",delta); +#else + printf("||U^dag*w_SSE-U^dag*w_FPU||<= %.1e*||w||\n",delta); +#endif + +#if (defined AVX) + + fast_mixed_multiply(u,s,r); + slow_mixed_multiply(u,s,t); + delta=0.0; + + for (k=0;k<8;k++) + { + _vector_sub_assign(r[k],t[k]); + diff=(double)(_vector_prod_re(r[k],r[k])); + norm=(double)(_vector_prod_re(s[k],s[k])); + diff=sqrt(diff/norm); + if (diff>delta) + delta=diff; + } + + printf("||U/U^dag*w_AVX-U/U^dag*w_FPU||<= %.1e*||w||\n",delta); + +#endif +#endif + + exit(0); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/time2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/time2.c new file mode 100644 index 0000000000000000000000000000000000000000..de177790645e8f7d061c1be2c24d2724bda10540 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/time2.c @@ -0,0 +1,263 @@ + +/******************************************************************************* +* +* File time2.c +* +* Copyright (C) 2005, 2008, 2009, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Timing of the SU(3) x SU(3)-vector multiplication (double-precision programs) +* +*******************************************************************************/ + +#include +#include +#include +#include +#include "su3.h" +#include "random.h" +#include "su3fcts.h" + +static su3_dble u[4] ALIGNED16; +static su3_vector_dble s[8],r[8],t[8] ALIGNED16; + +#if (defined x64) +#if (defined AVX) +#include "avx.h" + +#define _su3_fast_multiply(r1,r2,u,s1,s2) \ + _avx_pair_load_dble(s1,s2); \ + _avx_su3_multiply_pair_dble(u); \ + _avx_pair_store_up_dble(r1,r2) + +#define _su3_fast_inverse_multiply(r1,r2,u,s1,s2) \ + _avx_pair_load_dble(s1,s2); \ + _avx_su3_inverse_multiply_pair_dble(u); \ + _avx_pair_store_up_dble(r1,r2) + +static void fast_multiply(su3_dble *ua,su3_vector_dble *sa, + su3_vector_dble *ra) +{ + _su3_fast_multiply((*(ra )),(*(ra+1)),(*(ua )),(*(sa )),(*(sa+1))); + _su3_fast_multiply((*(ra+2)),(*(ra+3)),(*(ua+1)),(*(sa+2)),(*(sa+3))); + _su3_fast_multiply((*(ra+4)),(*(ra+5)),(*(ua+2)),(*(sa+4)),(*(sa+5))); + _su3_fast_multiply((*(ra+6)),(*(ra+7)),(*(ua+3)),(*(sa+6)),(*(sa+7))); +} + + +static void fast_inverse_multiply(su3_dble *ua,su3_vector_dble *sa, + su3_vector_dble *ra) +{ + _su3_fast_inverse_multiply((*(ra )),(*(ra+1)),(*(ua )),(*(sa )),(*(sa+1))); + _su3_fast_inverse_multiply((*(ra+2)),(*(ra+3)),(*(ua+1)),(*(sa+2)),(*(sa+3))); + _su3_fast_inverse_multiply((*(ra+4)),(*(ra+5)),(*(ua+2)),(*(sa+4)),(*(sa+5))); + _su3_fast_inverse_multiply((*(ra+6)),(*(ra+7)),(*(ua+3)),(*(sa+6)),(*(sa+7))); +} + +#else +#include "sse2.h" + +#define _su3_fast_multiply(r,u,s) \ + _sse_load_dble(s); \ + _sse_su3_multiply_dble(u); \ + _sse_store_up_dble(r) + +#define _su3_fast_inverse_multiply(r,u,s) \ + _sse_load_dble(s); \ + _sse_su3_inverse_multiply_dble(u); \ + _sse_store_up_dble(r) + + +static void fast_multiply(su3_dble *ua,su3_vector_dble *sa, + su3_vector_dble *ra) +{ + _su3_fast_multiply((*(ra )),(*(ua )),(*(sa ))); + _su3_fast_multiply((*(ra+1)),(*(ua )),(*(sa+1))); + _su3_fast_multiply((*(ra+2)),(*(ua+1)),(*(sa+2))); + _su3_fast_multiply((*(ra+3)),(*(ua+1)),(*(sa+3))); + _su3_fast_multiply((*(ra+4)),(*(ua+2)),(*(sa+4))); + _su3_fast_multiply((*(ra+5)),(*(ua+2)),(*(sa+5))); + _su3_fast_multiply((*(ra+6)),(*(ua+3)),(*(sa+6))); + _su3_fast_multiply((*(ra+7)),(*(ua+3)),(*(sa+7))); +} + + +static void fast_inverse_multiply(su3_dble *ua,su3_vector_dble *sa, + su3_vector_dble *ra) +{ + _su3_fast_inverse_multiply((*(ra )),(*(ua )),(*(sa ))); + _su3_fast_inverse_multiply((*(ra+1)),(*(ua )),(*(sa+1))); + _su3_fast_inverse_multiply((*(ra+2)),(*(ua+1)),(*(sa+2))); + _su3_fast_inverse_multiply((*(ra+3)),(*(ua+1)),(*(sa+3))); + _su3_fast_inverse_multiply((*(ra+4)),(*(ua+2)),(*(sa+4))); + _su3_fast_inverse_multiply((*(ra+5)),(*(ua+2)),(*(sa+5))); + _su3_fast_inverse_multiply((*(ra+6)),(*(ua+3)),(*(sa+6))); + _su3_fast_inverse_multiply((*(ra+7)),(*(ua+3)),(*(sa+7))); +} + +#endif + +static void slow_inverse_multiply(su3_dble *ua,su3_vector_dble *sa, + su3_vector_dble *ra) +{ + _su3_inverse_multiply((*(ra )),(*(ua )),(*(sa ))); + _su3_inverse_multiply((*(ra+1)),(*(ua )),(*(sa+1))); + _su3_inverse_multiply((*(ra+2)),(*(ua+1)),(*(sa+2))); + _su3_inverse_multiply((*(ra+3)),(*(ua+1)),(*(sa+3))); + _su3_inverse_multiply((*(ra+4)),(*(ua+2)),(*(sa+4))); + _su3_inverse_multiply((*(ra+5)),(*(ua+2)),(*(sa+5))); + _su3_inverse_multiply((*(ra+6)),(*(ua+3)),(*(sa+6))); + _su3_inverse_multiply((*(ra+7)),(*(ua+3)),(*(sa+7))); +} + +#endif + +static void slow_multiply(su3_dble *ua,su3_vector_dble *sa, + su3_vector_dble *ra) +{ + _su3_multiply((*(ra )),(*(ua )),(*(sa ))); + _su3_multiply((*(ra+1)),(*(ua )),(*(sa+1))); + _su3_multiply((*(ra+2)),(*(ua+1)),(*(sa+2))); + _su3_multiply((*(ra+3)),(*(ua+1)),(*(sa+3))); + _su3_multiply((*(ra+4)),(*(ua+2)),(*(sa+4))); + _su3_multiply((*(ra+5)),(*(ua+2)),(*(sa+5))); + _su3_multiply((*(ra+6)),(*(ua+3)),(*(sa+6))); + _su3_multiply((*(ra+7)),(*(ua+3)),(*(sa+7))); +} + + +int main(void) +{ + int k,n,count; + double t1,t2,dt; +#if (defined x64) + double delta,diff,norm; +#endif + + printf("\n"); + printf("Time per double-precision SU(3) x SU(3)-vector multiplication\n"); + printf("-------------------------------------------------------------\n\n"); + +#if (defined AVX) + printf("Using AVX instructions\n"); +#elif (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n"); +#endif + + printf("Measurement made with all data in cache\n\n"); + + rlxd_init(1,123456); + + for (k=0;k<4;k++) + random_su3_dble(u+k); + + gauss_dble((double*)(s),48); + gauss_dble((double*)(r),48); + gauss_dble((double*)(t),48); + +#if (defined x64) + + n=(int)(1.0e6); + dt=0.0; + + while (dt<2.0) + { + t1=(double)clock(); + for (count=0;countdelta) + delta=diff; + } + +#if (defined AVX) + printf("||U*w_AVX-U*w_FPU||<= %.1e*||w||\n",delta); +#else + printf("||U*w_SSE-U*w_FPU||<= %.1e*||w||\n",delta); +#endif + + fast_inverse_multiply(u,s,r); + slow_inverse_multiply(u,s,t); + delta=0.0; + + for (k=0;k<8;k++) + { + _vector_sub_assign(r[k],t[k]); + diff=_vector_prod_re(r[k],r[k]); + norm=_vector_prod_re(s[k],s[k]); + diff=sqrt(diff/norm); + if (diff>delta) + delta=diff; + } + +#if (defined AVX) + printf("||U^dag*w_AVX-U^dag*w_FPU||<= %.1e*||w||\n",delta); +#else + printf("||U^dag*w_SSE-U^dag*w_FPU||<= %.1e*||w||\n",delta); +#endif +#endif + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/time3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/time3.c new file mode 100644 index 0000000000000000000000000000000000000000..c0776e3591b01124df5325bba8091d6893cedaff --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/time3.c @@ -0,0 +1,194 @@ + +/******************************************************************************* +* +* File time3.c +* +* Copyright (C) 2005, 2009, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Timing of su3xsu3, su3dagxsu3, ... +* +*******************************************************************************/ + +#include +#include +#include +#include +#include "random.h" +#include "su3.h" +#include "utils.h" +#include "su3fcts.h" + + +int main(void) +{ + int n,count; + double t1,t2,dt; + su3_dble *u,*v,*w; + u3_alg_dble *X; + + printf("\n"); + printf("Timing of su3xsu3, su3dagxsu3, ...\n"); + printf("----------------------------------\n\n"); + +#if (defined AVX) + printf("Using AVX instructions\n"); +#elif (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n"); +#endif + + printf("Measurement made with all data in cache\n\n"); + + u=amalloc(3*sizeof(su3_dble),4); + X=amalloc(sizeof(u3_alg_dble),3); + error((u==NULL)||(X==NULL),1,"main [time3.c]", + "Unable to allocate auxiliary array"); + v=u+1; + w=u+2; + + rlxd_init(1,23456); + random_su3_dble(u); + random_su3_dble(v); + ranlxd((double*)(&(*X).c1),9); + + n=(int)(1.0e6); + dt=0.0; + + while (dt<2.0) + { + t1=(double)clock(); + for (count=0;count +#include +#include +#include +#include "random.h" +#include "su3.h" +#include "utils.h" +#include "su3fcts.h" + + +int main(void) +{ + int n,count; + double t1,t2,dt; + su3_dble *u,*v; + su3_alg_dble *X; + u3_alg_dble *Y; + + printf("\n"); + printf("Timing of prod2su3alg, prod2u3alg and rotate_su3alg\n"); + printf("---------------------------------------------------\n\n"); + +#if (defined AVX) + printf("Using AVX instructions\n"); +#elif (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n"); +#endif + + printf("Measurement made with all data in cache\n\n"); + + u=amalloc(2*sizeof(*u),4); + X=amalloc(sizeof(*X),4); + Y=amalloc(sizeof(*Y),4); + error((u==NULL)||(X==NULL)||(Y==NULL),1, + "main [time4.c]","Unable to allocate auxiliary variables"); + v=u+1; + + rlxd_init(1,23456); + random_su3_dble(u); + random_su3_dble(v); + ranlxd((double*)(X),8); + ranlxd((double*)(Y),9); + + n=(int)(1.0e6); + dt=0.0; + + while (dt<2.0) + { + t1=(double)clock(); + for (count=0;count +#include +#include +#include +#include +#include "utils.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" + +static double mu[3]; +static su3_alg_dble *X; +static su3_dble *r,*u,*v,*w,*uu; +static const su3_dble u0={{0.0}}; +static ch_drv0_t *sp; +static ch_drv1_t *sg; +static ch_drv2_t *sf; +static double eps; + + +static void alloc_Xu(void) +{ + X=amalloc(2*sizeof(*X),4); + r=amalloc(4*sizeof(*r),4); + sp=amalloc(2*sizeof(*sp),4); + sg=amalloc(2*sizeof(*sg),4); + sf=amalloc(2*sizeof(*sf),4); + uu=amalloc(2*sizeof(*uu),4); + + error((X==NULL)||(r==NULL)||(sp==NULL)||(sf==NULL)||(sg==NULL)||(uu==NULL),1, + "alloc_Xu [time5.c]","Unable to allocate matrices"); + + u=r+1; + v=r+2; + w=r+3; +} + + +static void random_X(void) +{ + int i; + + ranlxd(&eps,1); + eps*=0.5; + + for (i=0;i<2;i++) + { + for (;;) + { + ranlxd(mu,2); + mu[0]=2.0*mu[0]-1.0; + mu[1]=2.0*mu[1]-1.0; + mu[2]=-mu[0]-mu[1]; + + if (fabs(mu[2])<=1.0) + break; + } + + (*u)=u0; + (*u).c11.im=mu[0]; + (*u).c22.im=mu[1]; + (*u).c33.im=mu[2]; + + random_su3_dble(r); + su3xsu3(r,u,w); + su3xsu3dag(w,r,u); + + X[i].c1=((*u).c11.im-(*u).c22.im)/3.0; + X[i].c2=((*u).c11.im-(*u).c33.im)/3.0; + X[i].c3=(*u).c12.re; + X[i].c4=(*u).c12.im; + X[i].c5=(*u).c13.re; + X[i].c6=(*u).c13.im; + X[i].c7=(*u).c23.re; + X[i].c8=(*u).c23.im; + } + + random_su3_dble(uu); + random_su3_dble(uu+1); +} + + +static int eval_nsplt(double eps,su3_alg_dble *X) +{ + double nfrb; + int n; + + nfrb=4.0*(3.0*((*X).c1*(*X).c1+(*X).c2*(*X).c2-(*X).c1*(*X).c2)+ + (*X).c3*(*X).c3+(*X).c4*(*X).c4+(*X).c5*(*X).c5+ + (*X).c6*(*X).c6+(*X).c7*(*X).c7+(*X).c8*(*X).c8); + + nfrb*=eps*eps; + n=0; + while(nfrb>3.0) + { + nfrb*=0.25; + n++; + } + + return n; +} + + +static int find_N(void) +{ + int i; + double r; + + r=1.0; + + for (i=1;r>DBL_EPSILON;i++) + r/=(double)(i); + + i+=7; + + return i+(i%2); +} + + +int main(void) +{ + int k,n,count,ns,nsplt,nop; + double t1,t2,dt; + + printf("\n"); + printf("Timing of chexp_drv*(), ch2mat() and expXsu3()\n"); + printf("----------------------------------------------\n\n"); + +#if (defined AVX) + printf("Using AVX and SSE3 instructions\n"); +#elif (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n"); +#endif + + printf("Measurement made with all data in cache\n\n"); + + alloc_Xu(); + rlxd_init(1,12345); + random_X(); + ns=find_N(); + + n=(int)(1.0e6); + dt=0.0; + + while (dt<2.0) + { + t1=(double)clock(); + for (count=0;count +#include +#include +#include "random.h" +#include "utils.h" +#include "random.h" +#include "linalg.h" +#include "sw_term.h" + +typedef union +{ + weyl w; + complex c[6]; + float r[12]; +} spin_t; + +typedef union +{ + spinor s; + complex c[12]; + float r[24]; +} spin2_t; + +typedef union +{ + complex c[36]; + float r[72]; +} mat_t; + +static pauli mp[2] ALIGNED16; +static spin_t s1,s2,r1,r2 ALIGNED16; +static spin2_t sd1,sd2,rd1,rd2 ALIGNED16; +static mat_t mv[2] ALIGNED16; + + +static void cpvec(int n,complex *s,complex *r) +{ + int i; + + for (i=0;ii) + { + mv[im].c[6*i+j].re= mv[im].c[6*j+i].re; + mv[im].c[6*i+j].im=-mv[im].c[6*j+i].im; + } + else + mv[im].c[6*i+j].im=0.0f; + } + } + } + + for (im=0;im<2;im++) + { + k=6; + + for (i=0;i<6;i++) + { + mp[im].u[i]=mv[im].c[6*i+i].re; + + for (j=i+1;j<6;j++) + { + mp[im].u[k]=mv[im].c[6*i+j].re; + k+=1; + mp[im].u[k]=mv[im].c[6*i+j].im; + k+=1; + } + } + } + + gauss(s1.r,12); + cpvec(6,s1.c,s2.c); + mul_pauli(mu,mp,&(s1.w),&(r1.w)); + + error(diffvec(6,s1.c,s2.c),1,"main [check1.c]", + "mul_pauli() modifies the source spinor"); + + cmat_vec(6,mv[0].c,s2.c,r2.c); + + for (i=0;i<6;i++) + { + r2.c[i].re-=mu*s2.c[i].im; + r2.c[i].im+=mu*s2.c[i].re; + } + + printf("mul_pauli():\n"); + printf("r1: result, r2: expected result\n\n"); + + for (i=0;i<2;i++) + { + for (j=0;j<3;j++) + { + k=3*i+j; + printf("r1.c%d.c%d=(% .7e,% .7e)\n",i+1,j+1,r1.c[k].re,r1.c[k].im); + printf("r2.c%d.c%d=(% .7e,% .7e)\n",i+1,j+1,r2.c[k].re,r2.c[k].im); + printf("\n"); + } + } + + dmax=0.0f; + + for (i=0;i<12;i++) + { + d=(float)(fabs((double)(r1.r[i]-r2.r[i]))); + if (d>dmax) + dmax=d; + } + + printf("Maximal absolute deviation = %.1e\n",dmax); + + mul_pauli(mu,mp,&(s1.w),&(s1.w)); + error(diffvec(6,s1.c,r1.c),1,"main [check1.c]", + "mul_pauli() is incorrect when r=s"); + printf("Works correctly if input and output spinors coincide\n\n"); + + gauss(sd1.r,24); + cpvec(12,sd1.c,sd2.c); + mul_pauli2(mu,mp,&(sd1.s),&(rd1.s)); + + error(diffvec(12,sd1.c,sd2.c),1,"main [check1.c]", + "mul_pauli2() modifies the source spinor"); + + cmat_vec(6,mv[0].c,sd2.c,rd2.c); + cmat_vec(6,mv[1].c,sd2.c+6,rd2.c+6); + + for (i=0;i<6;i++) + { + rd2.c[i].re-=mu*sd2.c[i].im; + rd2.c[i].im+=mu*sd2.c[i].re; + } + + for (i=6;i<12;i++) + { + rd2.c[i].re+=mu*sd2.c[i].im; + rd2.c[i].im-=mu*sd2.c[i].re; + } + + printf("mul_pauli2():\n"); + printf("r1: result, r2: expected result\n\n"); + + for (i=0;i<4;i++) + { + for (j=0;j<3;j++) + { + k=3*i+j; + printf("r1.c%d.c%d=(% .7e,% .7e)\n",i+1,j+1,rd1.c[k].re,rd1.c[k].im); + printf("r2.c%d.c%d=(% .7e,% .7e)\n",i+1,j+1,rd2.c[k].re,rd2.c[k].im); + printf("\n"); + } + } + + dmax=0.0f; + + for (i=0;i<24;i++) + { + d=(float)(fabs((double)(rd1.r[i]-rd2.r[i]))); + if (d>dmax) + dmax=d; + } + + printf("Maximal absolute deviation = %.1e\n",dmax); + + mul_pauli2(mu,mp,&(sd1.s),&(sd1.s)); + error(diffvec(12,sd1.c,rd1.c),1,"main [check1.c]", + "mul_pauli2() is incorrect when r=s"); + printf("Works correctly if input and output spinors coincide\n\n"); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..038651072b11a801bcd76fe847e8144c906681b3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check2.c @@ -0,0 +1,163 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2005, 2009, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of mul_pauli_dble() +* +*******************************************************************************/ + +#include +#include +#include +#include "random.h" +#include "utils.h" +#include "random.h" +#include "linalg.h" +#include "sw_term.h" + +typedef union +{ + weyl_dble w; + complex_dble c[6]; + double r[12]; +} spin_t; + +typedef union +{ + complex_dble c[36]; + double r[72]; +} mat_t; + +#if (defined AVX) +static pauli_dble mp ALIGNED32; +static spin_t s1,s2,r1,r2 ALIGNED32; +static mat_t mv ALIGNED32; +#else +static pauli_dble mp ALIGNED16; +static spin_t s1,s2,r1,r2 ALIGNED16; +static mat_t mv ALIGNED16; +#endif + +static void cpvec(int n,complex_dble *s,complex_dble *r) +{ + int i; + + for (i=0;ii) + { + mv.c[6*i+j].re= mv.c[6*j+i].re; + mv.c[6*i+j].im=-mv.c[6*j+i].im; + } + else + mv.c[6*i+j].im=0.0; + } + } + + k=6; + + for (i=0;i<6;i++) + { + mp.u[i]=mv.c[6*i+i].re; + + for (j=i+1;j<6;j++) + { + mp.u[k]=mv.c[6*i+j].re; + k+=1; + mp.u[k]=mv.c[6*i+j].im; + k+=1; + } + } + + cpvec(6,s1.c,s2.c); + mul_pauli_dble(mu,&mp,&(s1.w),&(r1.w)); + + error(diffvec(6,s1.c,s2.c),1,"main [check2.c]", + "mul_pauli_dble() modifies the source spinor"); + + cmat_vec_dble(6,mv.c,s2.c,r2.c); + + for (i=0;i<6;i++) + { + r2.c[i].re-=mu*s2.c[i].im; + r2.c[i].im+=mu*s2.c[i].re; + } + + printf("r1: result, r2: expected result\n\n"); + + for (i=0;i<2;i++) + { + for (j=0;j<3;j++) + { + k=3*i+j; + printf("r1.c%d.c%d=(% .7e,% .7e)\n",i+1,j+1,r1.c[k].re,r1.c[k].im); + printf("r2.c%d.c%d=(% .7e,% .7e)\n",i+1,j+1,r2.c[k].re,r2.c[k].im); + printf("\n"); + } + } + + dmax=0.0; + + for (i=0;i<12;i++) + { + d=fabs(r1.r[i]-r2.r[i]); + if (d>dmax) + dmax=d; + } + + printf("Maximal absolute deviation = %.1e\n",dmax); + + mul_pauli_dble(mu,&mp,&(s1.w),&(s1.w)); + error(diffvec(6,s1.c,r1.c),1,"main [check2.c]", + "mul_pauli_dble() is incorrect when r=s"); + printf("Works correctly if input and output spinors coincide\n\n"); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..90d5df0988943d24052e41366703ec2bc3644440 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check3.c @@ -0,0 +1,200 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of assign_pauli() and apply_sw() +* +*******************************************************************************/ + +#include +#include +#include +#include "random.h" +#include "utils.h" +#include "random.h" +#include "linalg.h" +#include "sw_term.h" + +#define NM 131 + +typedef union +{ + spinor s; + complex c[12]; + float r[24]; +} spin_t; + +static pauli m[2*NM] ALIGNED16; +static pauli_dble md[2*NM] ALIGNED16; +static spin_t sp1[NM],sp2[NM],rp1[NM],rp2[NM] ALIGNED16; +static complex mv[36] ALIGNED16; + + +static void random_pauli_dble(void) +{ + int i; + double *u; + + for (i=0;i<(2*NM);i++) + { + u=md[i].u; + gauss_dble(u,36); + } +} + + +static float diff_pauli(void) +{ + int i,j; + float d,dmax,*u; + double *ud; + + dmax=0.0f; + + for (i=0;i<(2*NM);i++) + { + u=m[i].u; + ud=md[i].u; + + for (j=0;j<36;j++) + { + d=u[j]-(float)(ud[j]); + if (d<0.0f) + d=-d; + if (d>dmax) + dmax=d; + } + } + + return dmax; +} + + +static void random_spin(void) +{ + int i; + + for (i=0;idmax) + dmax=d; + } + } + + return dmax; +} + + +static void pauli2mv(float mu,pauli *mp) +{ + int i,j,k; + float *u; + + u=(*mp).u; + k=6; + + for (i=0;i<6;i++) + { + mv[6*i+i].re=u[i]; + mv[6*i+i].im=mu; + + for (j=i+1;j<6;j++) + { + mv[6*i+j].re=u[k]; + mv[6*j+i].re=u[k]; + k+=1; + mv[6*i+j].im=u[k]; + mv[6*j+i].im=-u[k]; + k+=1; + } + } +} + + +int main(void) +{ + int i; + float mu; + spinor *s1,*r1; + + printf("\n"); + printf("Check of assign_pauli() and apply_sw()\n"); + printf("--------------------------------------\n\n"); + +#if (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n\n"); +#endif + + rlxs_init(0,3898); + random_pauli_dble(); + assign_pauli(2*NM,md,m); + + printf("Check of assign_pauli():\nAbsolute deviation = %.1e\n\n", + diff_pauli()); + + random_spin(); + cp_spin(sp1,sp2); + mu=0.1234f; + + s1=(spinor*)(sp1); + r1=(spinor*)(rp1); + apply_sw(NM,mu,m,s1,r1); + + error(diff_spin(sp1,sp2)!=0.0f,1,"main [check3.c]", + "apply_sw() does not preserve the input spinor field"); + + for (i=0;i +#include +#include +#include "random.h" +#include "utils.h" +#include "random.h" +#include "linalg.h" +#include "sw_term.h" + +#define NM 1001 + +typedef union +{ + spinor_dble s; + complex_dble c[12]; + double r[24]; +} spin_t; + +static pauli_dble m[2*NM] ALIGNED16; +static spin_t sp1[NM],sp2[NM],rp1[NM],rp2[NM] ALIGNED16; +static complex_dble mv[36] ALIGNED16; + + +static void random_pauli_dble(void) +{ + int i,j; + double *u; + + for (i=0;i<(2*NM);i++) + { + u=m[i].u; + gauss_dble(u,36); + + for (j=0;j<6;j++) + u[j]+=10.0; + } +} + + +static void random_spin(void) +{ + int i; + + for (i=0;idmax) + dmax=d; + } + } + + return dmax; +} + + +static void pauli2mv(double mu,pauli_dble *mp) +{ + int i,j,k; + double *u; + + u=(*mp).u; + k=6; + + for (i=0;i<6;i++) + { + mv[6*i+i].re=u[i]; + mv[6*i+i].im=mu; + + for (j=i+1;j<6;j++) + { + mv[6*i+j].re=u[k]; + mv[6*j+i].re=u[k]; + k+=1; + mv[6*i+j].im=u[k]; + mv[6*j+i].im=-u[k]; + k+=1; + } + } +} + + +int main(void) +{ + int i,ie; + double mu; + spinor_dble *s1,*r1; + + printf("\n"); + printf("Check of apply_sw_dble() and apply_swinv_dble()\n"); + printf("-----------------------------------------------\n\n"); + +#if (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n\n"); +#endif + + rlxd_init(1,3898); + s1=(spinor_dble*)(sp1); + r1=(spinor_dble*)(rp1); + mu=0.0123; + + random_pauli_dble(); + random_spin(); + cp_spin(sp1,sp2); + apply_sw_dble(NM,mu,m,s1,r1); + + error(diff_spin(sp1,sp2)!=0.0,1,"main [check4.c]", + "apply_sw_dble() does not preserve the input spinor field"); + + for (i=0;i +#include +#include +#include "random.h" +#include "utils.h" +#include "random.h" +#include "linalg.h" +#include "sw_term.h" + +static pauli_dble ma[3] ALIGNED16; +static complex_dble aa[4][36] ALIGNED16; + + +static void random_pauli(pauli_dble *m) +{ + int i; + double *u; + + u=(*m).u; + gauss_dble(u,36); + + for (i=0;i<6;i++) + (*m).u[i]+=10.0; +} + + +static void pauli2mat(pauli_dble *m,complex_dble *a) +{ + int i,j,k; + double *u; + + u=(*m).u; + k=6; + + for (i=0;i<6;i++) + { + a[6*i+i].re=u[i]; + a[6*i+i].im=0.0; + + for (j=i+1;j<6;j++) + { + a[6*i+j].re=u[k]; + a[6*j+i].re=u[k]; + k+=1; + a[6*i+j].im=u[k]; + a[6*j+i].im=-u[k]; + k+=1; + } + } +} + + +int main(void) +{ + int i,j,ie; + double mu,d,dmax; + + printf("\n"); + printf("Check of inv_pauli_dble()\n"); + printf("-------------------------\n\n"); + +#if (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n\n"); +#endif + + rlxd_init(1,3898); + mu=0.1234; + ie=1; + + while (ie) + { + random_pauli(ma); + ie=inv_pauli_dble(mu,ma,ma+1); + } + + pauli2mat(ma,aa[0]); + pauli2mat(ma+1,aa[1]); + cmat_mul_dble(6,aa[0],aa[0],aa[2]); + + for (i=0;i<6;i++) + aa[2][6*i+i].re+=mu*mu; + + cmat_mul_dble(6,aa[1],aa[2],aa[3]); + cmat_sub_dble(6,aa[3],aa[0],aa[2]); + dmax=0.0; + + for (i=0;i<6;i++) + { + for (j=0;j<6;j++) + { + d=aa[2][6*i+j].re; + + if (d<0.0) + d=-d; + if (d>dmax) + dmax=d; + + d=aa[2][6*i+j].im; + + if (d<0.0) + d=-d; + if (d>dmax) + dmax=d; + } + } + + printf("Maximal absolute deviation = %.1e\n",dmax); + + inv_pauli_dble(mu,ma,ma); + dmax=0.0; + + for (i=0;i<36;i++) + { + d=ma[0].u[i]-ma[1].u[i]; + + if (d<0.0) + d=-d; + if (d>dmax) + dmax=d; + } + + error(dmax!=0.0,1,"main [check5.c]", + "inv_pauli_dble() is incorrect when m=im"); + printf("Works correctly if input and output matrices coincide\n\n"); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check6.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check6.c new file mode 100644 index 0000000000000000000000000000000000000000..acadea1b0af8336b93b5432a340fe026f19a0c7b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check6.c @@ -0,0 +1,159 @@ + +/******************************************************************************* +* +* File check6.c +* +* Copyright (C) 2005, 2009, 2010 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Accuracy of inv_pauli_dble() +* +*******************************************************************************/ + +#include +#include +#include +#include "random.h" +#include "utils.h" +#include "random.h" +#include "linalg.h" +#include "sw_term.h" + +#define NM 10000 + +typedef union +{ + weyl w; + complex c[6]; +} spin_t; + +typedef union +{ + weyl_dble w; + complex_dble c[6]; +} spin_dble_t; + +static spin_t vs ALIGNED16; +static spin_dble_t vd ALIGNED16; +static const weyl vs0={{{0.0f}}}; +static const weyl_dble vd0={{{0.0}}}; + + +int main(void) +{ + int n,k,l,itot,*is; + double mu,fact,d,dmax; + pauli *ms,*ims,*msb,*imsb; + pauli_dble *md,*imd,*mdb,*imdb; + + printf("\n"); + printf("Accuracy of inv_pauli_dble()\n"); + printf("----------------------------\n\n"); + +#if (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n\n"); +#endif + + is=amalloc(NM*sizeof(*is),3); + msb=amalloc(3*NM*sizeof(*msb),4); + mdb=amalloc(3*NM*sizeof(*mdb),4); + error((is==NULL)||(msb==NULL)||(mdb==NULL),1, + "main [check6.c]","Unable to allocate auxiliary arrays"); + + imsb=msb+NM; + imdb=mdb+NM; + + rlxd_init(1,1234); + mu=0.0123; + md=mdb; + imd=imdb; + itot=0; + dmax=0.0; + fact=sqrt(2.0); + + for (n=0;ndmax) + dmax=d; + } + } + } + else + itot+=1; + + md+=1; + imd+=1; + } + + printf("Double-precision program, mu=%.4f:\n",mu); + printf("%d Gaussian random matrices, %d inversion failures\n",NM,itot); + printf("Maximal relative deviation = %.1e ",sqrt(dmax)); + printf("(safe cases only)\n\n"); + + assign_pauli(NM,mdb,msb); + assign_pauli(2*NM,imdb,imsb); + + ms=msb; + ims=imsb; + dmax=0.0; + + for (n=0;ndmax) + dmax=d; + } + } + } + + ms+=1; + ims+=1; + } + + printf("After assignment to single-precision matrices:\n"); + printf("Maximal relative deviation = %.1e ",sqrt(dmax)); + printf("(safe cases only)\n\n"); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check7.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check7.c new file mode 100644 index 0000000000000000000000000000000000000000..998b01d6726780ad2d3ed60937a92aac2c8f21fb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check7.c @@ -0,0 +1,242 @@ + +/******************************************************************************* +* +* File check7.c +* +* Copyright (C) 2005, 2009, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of det_pauli_dble() +* +*******************************************************************************/ + +#include +#include +#include +#include "random.h" +#include "utils.h" +#include "random.h" +#include "linalg.h" +#include "sw_term.h" + +#define NM 10000 + +static double dd[6] ALIGNED16; +static complex_dble aa[36],bb[36],vv[36],ww[36] ALIGNED16; + + +static complex_dble random_dd(double mu) +{ + int i; + complex_dble det,z; + + ranlxd(dd,6); + det.re=1.0; + det.im=0.0; + + for (i=0;i<6;i++) + { + if (dd[i]<0.5) + dd[i]-=0.6; + else + dd[i]-=0.4; + + z.re=det.re*dd[i]-det.im*mu; + z.im=det.re*mu+det.im*dd[i]; + + det.re=z.re; + det.im=z.im; + } + + return det; +} + + +static double norm(complex_dble *v) +{ + int i; + double r; + + r=0.0; + + for (i=0;i<6;i++) + r+=(v[i].re*v[i].re+v[i].im*v[i].im); + + return sqrt(r); +} + + +static complex_dble prod(complex_dble *v,complex_dble *w) +{ + int i; + complex_dble z; + + z.re=0.0; + z.im=0.0; + + for (i=0;i<6;i++) + { + z.re+=(v[i].re*w[i].re+v[i].im*w[i].im); + z.im+=(v[i].re*w[i].im-v[i].im*w[i].re); + } + + return z; +} + + +static void proj(complex_dble *v,complex_dble *w) +{ + int i; + complex_dble z; + + z=prod(v,w); + + for (i=0;i<6;i++) + { + w[i].re-=(z.re*v[i].re-z.im*v[i].im); + w[i].im-=(z.re*v[i].im+z.im*v[i].re); + } +} + + +static void random_vv(void) +{ + int i,j; + double r,ri[12]; + complex_dble *vi; + + for (i=0;i<6;i++) + { + vi=vv+6*i; + r=0.0; + + while (r<1.0) + { + gauss_dble(ri,12); + + for (j=0;j<6;j++) + { + vi[j].re=ri[2*j]; + vi[j].im=ri[2*j+1]; + } + + for (j=0;jdmax) + dmax=d; + + md+=1; + } + + printf("%d Gaussian random matrices M, mu=%.4f\n",NM,mu); + printf("Maximal relative deviation of det(M+i*mu) = %.1e\n\n",dmax); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/time1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/time1.c new file mode 100644 index 0000000000000000000000000000000000000000..e046fcc79b91d70c51c765e58d80c0975014b36d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/time1.c @@ -0,0 +1,117 @@ + +/******************************************************************************* +* +* File time1.c +* +* Copyright (C) 2005, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Timing of mul_pauli() and mul_pauli2() +* +*******************************************************************************/ + +#include +#include +#include +#include +#include "utils.h" +#include "random.h" +#include "linalg.h" +#include "sw_term.h" + +typedef union +{ + weyl w; + float r[12]; +} spin_t; + +typedef union +{ + spinor s; + float r[24]; +} spin2_t; + +static pauli mp[4] ALIGNED16; +static spin_t s1,s2,r1,r2 ALIGNED16; +static spin2_t sd1,sd2,rd1,rd2 ALIGNED16; + + +int main(void) +{ + int n,count; + float mu1,mu2; + double t1,t2,dt; + + printf("\n"); + printf("Timing of mul_pauli() and mul_pauli2()\n"); + printf("--------------------------------------\n\n"); + +#if (defined AVX) + printf("Using AVX instructions\n\n"); +#elif (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n\n"); +#endif + + printf("Measurement made with all data in cache\n\n"); + + rlxs_init(0,23456); + + for (n=0;n<4;n++) + ranlxs(mp[n].u,36); + + ranlxs(s1.r,12); + ranlxs(s2.r,12); + ranlxs(sd1.r,24); + ranlxs(sd2.r,24); + + mu1=0.1234f; + mu2=0.5678f; + + n=(int)(1.0e6); + dt=0.0; + + while (dt<2.0) + { + t1=(double)clock(); + for (count=0;count +#include +#include +#include +#include "utils.h" +#include "random.h" +#include "linalg.h" +#include "sw_term.h" + +typedef union +{ + weyl_dble w; + double r[12]; +} spin_t; + +#if (defined AVX) +static pauli_dble mp1,mp2 ALIGNED32; +static spin_t s1,s2,r1,r2 ALIGNED32; +#else +static pauli_dble mp1,mp2 ALIGNED16; +static spin_t s1,s2,r1,r2 ALIGNED16; +#endif + +int main(void) +{ + int n,count; + double mu1,mu2; + double t1,t2,dt; + + printf("\n"); + printf("Timing of mul_pauli_dble()\n"); + printf("--------------------------\n\n"); + +#if (defined AVX) + printf("Using AVX instructions\n\n"); +#elif (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n\n"); +#endif + + printf("Measurement made with all data in cache\n\n"); + + rlxd_init(1,23456); + ranlxd(mp1.u,36); + ranlxd(mp2.u,36); + ranlxd(s1.r,12); + ranlxd(s2.r,12); + mu1=0.1234; + mu2=0.5678; + + n=(int)(1.0e6); + dt=0.0; + + while (dt<2.0) + { + t1=(double)clock(); + for (count=0;count +#include +#include +#include +#include "random.h" +#include "utils.h" +#include "random.h" +#include "linalg.h" +#include "sw_term.h" + + +int main(void) +{ + int n,count,itest; + double t1,t2,dt,mu; + pauli_dble *m; + + printf("\n"); + printf("Timing of inv_pauli_dble() and det_pauli_dble()\n"); + printf("-----------------------------------------------\n\n"); + +#if (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n\n"); +#endif + + printf("Measurement made with all data in cache\n\n"); + + m=amalloc(2*sizeof(*m),4); + error(m==NULL,1,"main [time3.c]", + "Unable to allocate auxiliary arrays"); + + rlxd_init(1,23456); + ranlxd((*m).u,36); + mu=0.1234; + + for (n=0;n<6;n++) + (*m).u[n]=1.0; + + for (n=6;n<36;n++) + (*m).u[n]=0.01*((*m).u[n]-0.5); + + n=(int)(1.0e5); + dt=0.0; + itest=0; + + while (dt<2.0) + { + t1=(double)clock(); + for (count=0;count +#include +#include +#include "su3.h" +#include "random.h" +#include "utils.h" + +#define N 18000 + +static int istd[N],istds[N]; +static double dstd[N],dstds[N]; +static su3_dble ufld[N/18]; + + +static void set_u2v(su3_dble *u,double *v) +{ + v[ 0]=(*u).c11.re; + v[ 1]=(*u).c11.im; + v[ 2]=(*u).c12.re; + v[ 3]=(*u).c12.im; + v[ 4]=(*u).c13.re; + v[ 5]=(*u).c13.im; + + v[ 6]=(*u).c21.re; + v[ 7]=(*u).c21.im; + v[ 8]=(*u).c22.re; + v[ 9]=(*u).c22.im; + v[10]=(*u).c23.re; + v[11]=(*u).c23.im; + + v[12]=(*u).c31.re; + v[13]=(*u).c31.im; + v[14]=(*u).c32.re; + v[15]=(*u).c32.im; + v[16]=(*u).c33.re; + v[17]=(*u).c33.im; +} + + +static void set_v2u(double *v,su3_dble *u) +{ + (*u).c11.re=v[ 0]; + (*u).c11.im=v[ 1]; + (*u).c12.re=v[ 2]; + (*u).c12.im=v[ 3]; + (*u).c13.re=v[ 4]; + (*u).c13.im=v[ 5]; + + (*u).c21.re=v[ 6]; + (*u).c21.im=v[ 7]; + (*u).c22.re=v[ 8]; + (*u).c22.im=v[ 9]; + (*u).c23.re=v[10]; + (*u).c23.im=v[11]; + + (*u).c31.re=v[12]; + (*u).c31.im=v[13]; + (*u).c32.re=v[14]; + (*u).c32.im=v[15]; + (*u).c33.re=v[16]; + (*u).c33.im=v[17]; +} + + +int main(void) +{ + int ie,k,it; + stdint_t i[2]; + double d[2]; + char *ci[2],*cd[2]; + + printf("\n"); + printf("Test of the endianness and byte swapping programs\n"); + printf("-------------------------------------------------\n\n"); + + printf("sizeof(stdint_t) = %d\n",(int)(sizeof(stdint_t))); + printf("sizeof(double) = %d\n",(int)(sizeof(double))); + + ie=endianness(); + if (ie==LITTLE_ENDIAN) + printf("The machine is little endian\n\n"); + else if (ie==BIG_ENDIAN) + printf("The machine is big endian\n\n"); + else + printf("The machine has unknown endianness\n\n"); + + ci[0]=(char*)(i); + ci[1]=(char*)(i+1); + + ci[0][0]='A'; + ci[0][1]='B'; + ci[0][2]='C'; + ci[0][3]='D'; + + ci[1][0]='1'; + ci[1][1]='2'; + ci[1][2]='3'; + ci[1][3]='4'; + + printf("Byte swapping integers:\n"); + printf("%.4s, %.4s -> ",ci[0],ci[1]); + bswap_int(2,i); + printf("%.4s, %.4s\n\n",ci[0],ci[1]); + + cd[0]=(char*)(d); + cd[1]=(char*)(d+1); + + cd[0][0]='A'; + cd[0][1]='B'; + cd[0][2]='C'; + cd[0][3]='D'; + cd[0][4]='E'; + cd[0][5]='F'; + cd[0][6]='G'; + cd[0][7]='H'; + + cd[1][0]='1'; + cd[1][1]='2'; + cd[1][2]='3'; + cd[1][3]='4'; + cd[1][4]='5'; + cd[1][5]='6'; + cd[1][6]='7'; + cd[1][7]='8'; + + printf("Byte swapping double precision numbers:\n"); + printf("%.8s, %.8s -> ",cd[0],cd[1]); + bswap_double(2,d); + printf("%.8s, %.8s\n\n",cd[0],cd[1]); + + gauss_dble(dstd,N); + + for (k=0;k +#include +#include +#include "utils.h" + + +int main(void) +{ + int n; + double x; + + printf("\n"); + printf("Test of the program fdigits()\n"); + printf("-----------------------------\n\n"); + + while (1) + { + printf("x = "); + scanf("%lf",&x); + n=fdigits(x); + printf(" %.*f\n\n",n,x); + } + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/random/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/random/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..983f469eaa45064ac27b61aca22e39a958c9825e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/random/INDEX @@ -0,0 +1,6 @@ + +Random number generation programs + +check1 Check of import/export functions for the ranlux generators + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/random/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/random/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..89d23571d1f9839760b4f43c37b594c1e00ee040 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/random/Makefile @@ -0,0 +1,122 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 + +FLAGS = flags lat_parms + +LATTICE = geometry + +RANDOM = gauss ranlux ranlxs ranlxd + +UTILS = mutils utils endian + +MODULES = $(FLAGS) $(LATTICE) $(RANDOM) $(UTILS) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/random:$(MDIR)/su3fcts:\ + $(MDIR)/uflds:$(MDIR)/utils + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog \ + *.log~ *.dat *.dat~ $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/random/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/random/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..9b01e6ba149458d6b39bf0087ea600e2d06de929 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/random/check1.c @@ -0,0 +1,91 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of import/export functions for the ranlux generators +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "global.h" + +#define NRAN 10000 + +static float r[2*NRAN]; +static double rd[2*NRAN]; + + +int main(int argc,char *argv[]) +{ + int my_rank,tag,k,ie,ied; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + + printf("\n"); + printf("Check of import/export functions for the ranlux generators\n"); + printf("----------------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + } + + start_ranlux(0,1234); + ranlxs(r,NRAN); + ranlxd(rd,NRAN); + tag=98029; + + export_ranlux(tag,"check1.dat"); + ranlxs(r,NRAN); + ranlxd(rd,NRAN); + + k=import_ranlux("check1.dat"); + error (k!=tag,1,"main [check1.c]", + "Import_ranlux() returns incorrect tag"); + + ranlxs(r+NRAN,NRAN); + ranlxd(rd+NRAN,NRAN); + + ie=0; + ied=0; + + for (k=0;k +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "ratfcts.h" +#include "global.h" + + +static double eval_rat1(ratfct_t *rf,double x) +{ + int np,i; + double *mu,*rmu,r; + + np=(*rf).np; + mu=(*rf).mu; + rmu=(*rf).rmu; + r=0.0; + + for (i=0;idmax) + dmax=d; + } + + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return dmax; +} + + +static double diff_rat2(double ra,double rb,ratfct_t *rf) +{ + int k; + double r,x,d,dmax; + + dmax=0.0; + + for (k=0;k<1000;k++) + { + ranlxd(&r,1); + x=ra+r*(rb-ra); + + d=fabs(1.0-eval_rat1(rf,x)*eval_rat2(rf,x)); + + if (d>dmax) + dmax=d; + } + + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return dmax; +} + + +static double diff_rat3(double ra,double rb,ratfct_t *rf) +{ + int k; + double r,x,d,dmax; + + dmax=0.0; + + for (k=0;k<1000;k++) + { + ranlxd(&r,1); + x=ra+r*(rb-ra); + + d=fabs(1.0-(eval_rat1(rf+1,x)*eval_rat1(rf+2,x))/eval_rat1(rf,x)); + + if (d>dmax) + dmax=d; + } + + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return dmax; +} + + +int main(int argc,char *argv[]) +{ + int my_rank,irat[3]; + int np1,i,j; + double dmax; + rat_parms_t rp; + ratfct_t rf[3]; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + fin=freopen("check1.in","r",stdin); + + printf("\n"); + printf("Initialization of rational functions\n"); + printf("------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + } + + read_rat_parms(0); + + if (my_rank==0) + fclose(fin); + + print_rat_parms(); + start_ranlux(0,123456); + + rp=rat_parms(0); + irat[0]=0; + irat[1]=0; + irat[2]=rp.degree-1; + rf[0]=ratfct(irat); + + if (my_rank==0) + { + printf("Complete rational function:\n"); + printf("np= %2d, A = %.2e, delta = %.2e\n", + rf[0].np,rf[0].A,rf[0].delta); + + printf(" i mu[i] rmu[i]\n"); + + for (i=0;i0.1); + + if ((np1>0)&&(np1 that +allows the type of boundary condition to be chosen at runtime. When the option +is not set, open boundary conditions are assumed. + +The option may be set but has no effect in the case of check3 (the boundary +conditions are selected through the input parameter file in this case). diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..67b3f56cb3163f712b5e703fb6eb839a9f91bf2e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/Makefile @@ -0,0 +1,149 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 time1 time2 + +FLAGS = flags lat_parms sap_parms dfl_parms + +LATTICE = bcnds ftidx uidx geometry + +LINALG = salg salg_dble liealg cmatrix_dble + +LINSOLV = fgcr + +RANDOM = ranlux ranlxs ranlxd gauss + +UFLDS = plaq_sum shift uflds udcom + +SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +UTILS = endian mutils utils wspace + +SFLDS = sflds scom sdcom Pbnd Pbnd_dble + +TCHARGE = ftcom ftensor + +SW_TERM = pauli pauli_dble swflds sw_term + +DIRAC = Dw_dble Dw Dw_bnd + +BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + +SAP = blk_solv sap_com sap sap_gcr + +ARCHIVE = archive + +MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ + $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) $(DIRAC) \ + $(BLOCK) $(SAP) $(ARCHIVE) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/linsolv:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ + $(MDIR)/sflds:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:\ + $(MDIR)/block:$(MDIR)/sap:$(MDIR)/archive: + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + +# -DFGCR_DBG + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..2b636e5a84f145be9dcc45eb21a4ee2d4ab7442b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check1.c @@ -0,0 +1,225 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the block solver programs. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "block.h" +#include "dirac.h" +#include "sap.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc; + int nb,isw,ie,itm; + int bs[4],n,k,vol,volh; + float mu,res0,res[8],res_max[8]; + double phi[2],phi_prime[2]; + spinor **ps; + block_t *b; + tm_parms_t tm; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + fin=freopen("check1.in","r",stdin); + + printf("\n"); + printf("Check of the block solver programs\n"); + printf("----------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + fclose(fin); + + printf("bs = %d %d %d %d\n\n",bs[0],bs[1],bs[2],bs[3]); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check1.c]", + "Syntax: check1 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + set_sap_parms(bs,0,1,1); + alloc_bgr(SAP_BLOCKS); + alloc_ws(4); + + set_sw_parms(0.05); + mu=0.123f; + ps=reserve_ws(4); + + for (itm=0;itm<2;itm++) + { + if (itm==1) + set_tm_parms(1); + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + assign_ud2ubgr(SAP_BLOCKS); + assign_swd2swbgr(SAP_BLOCKS,NO_PTS); + + b=blk_list(SAP_BLOCKS,&nb,&isw); + vol=(*b).vol; + volh=vol/2; + + for (k=0;k<8;k++) + res_max[k]=0.0f; + + random_s(VOLUME,ps[0],1.0f); + bnd_s2zero(ALL_PTS,ps[0]); + set_s2zero(VOLUME,ps[1]); + + for (n=0;nres_max[k]) + res_max[k]=res[k]; + } + } + + error_chk(); + + if (NPROC>1) + { + MPI_Reduce(res_max,res,8,MPI_FLOAT,MPI_MAX,0,MPI_COMM_WORLD); + + for (k=0;k<8;k++) + res_max[k]=res[k]; + } + + if (my_rank==0) + { + tm=tm_parms(); + printf("Twisted-mass flag = %d\n",tm.eoflg); + printf("Check of blk_mres():\n"); + + for (k=0;k<8;k++) + printf("nmr = %2d, res_max = %.1e\n", + 4*(k+1),sqrt((double)(res_max[k]))); + } + + for (k=0;k<8;k++) + res_max[k]=0.0f; + + ie=assign_swd2swbgr(SAP_BLOCKS,ODD_PTS); + error_root(ie,1,"main [check1.c]", + "The inversion of the SW term was not safe"); + + random_s(VOLUME,ps[0],1.0f); + bnd_s2zero(ALL_PTS,ps[0]); + set_s2zero(VOLUME,ps[1]); + + for (n=0;nres_max[k]) + res_max[k]=res[k]; + } + } + + error_chk(); + + if (NPROC>1) + { + MPI_Reduce(res_max,res,8,MPI_FLOAT,MPI_MAX,0,MPI_COMM_WORLD); + + for (k=0;k<8;k++) + res_max[k]=res[k]; + } + + if (my_rank==0) + { + printf("Check of blk_eo_mres():\n"); + + for (k=0;k<8;k++) + printf("nmr = %2d, res_max = %.1e\n", + 3*(k+1),sqrt((double)(res_max[k]))); + + printf("\n"); + } + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check1.in new file mode 100644 index 0000000000000000000000000000000000000000..cb2b6435876b968f9ca35a085e965971e56ca69b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check1.in @@ -0,0 +1 @@ +bs 8 4 4 4 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..59722bcb362ccc5554dc7b69dad146263613ed41 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check2.c @@ -0,0 +1,224 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2005, 2008, 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Direct test of the Schwarz alternating procedure. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "block.h" +#include "dirac.h" +#include "sap.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc; + int n,ie,itm; + int bs[4],nmr; + float mu,res,del[3]; + double phi[2],phi_prime[2]; + spinor **ps; + tm_parms_t tm; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check2.log","w",stdout); + fin=freopen("check2.in","r",stdin); + + printf("\n"); + printf("Direct test of the Schwarz alternating procedure\n"); + printf("------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + read_line("mu","%f",&mu); + read_line("nmr","%d",&nmr); + fclose(fin); + + printf("bs = %d %d %d %d\n",bs[0],bs[1],bs[2],bs[3]); + printf("mu = %.3e\n",mu); + printf("nmr = %d\n\n",nmr); + fflush(flog); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check2.c]", + "Syntax: check2 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_FLOAT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + set_sap_parms(bs,0,1,1); + alloc_bgr(SAP_BLOCKS); + alloc_ws(4); + ps=reserve_ws(4); + + set_sw_parms(0.05); + + for (itm=0;itm<2;itm++) + { + if (itm==0) + set_tm_parms(1); + else + set_tm_parms(0); + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + assign_ud2u(); + assign_swd2sw(); + assign_ud2ubgr(SAP_BLOCKS); + assign_swd2swbgr(SAP_BLOCKS,NO_PTS); + + set_s2zero(VOLUME,ps[0]); + random_s(VOLUME,ps[1],1.0f); + bnd_s2zero(ALL_PTS,ps[1]); + normalize(VOLUME,1,ps[1]); + assign_s2s(VOLUME,ps[1],ps[2]); + + if (my_rank==0) + { + tm=tm_parms(); + printf("Twisted-mass flag = %d\n",tm.eoflg); + printf("MinRes block solver:\n"); + } + + for (n=0;n<8;n++) + { + sap(mu,0,nmr,ps[0],ps[1]); + res=norm_square(VOLUME,1,ps[1]); + res=(float)(sqrt((double)(res))); + + if (my_rank==0) + printf("n = %d: \t residue = %.2e\t ",n+1,res); + + Dw(mu,ps[0],ps[3]); + mulr_spinor_add(VOLUME,ps[3],ps[2],-1.0f); + mulr_spinor_add(VOLUME,ps[3],ps[1],1.0f); + del[0]=norm_square(VOLUME,1,ps[3]); + del[0]=(float)(sqrt((double)(del[0]))); + + assign_s2s(VOLUME,ps[0],ps[3]); + bnd_s2zero(ALL_PTS,ps[3]); + mulr_spinor_add(VOLUME,ps[3],ps[0],-1.0f); + del[1]=norm_square(VOLUME,1,ps[3]); + del[1]=(float)(sqrt((double)(del[1]))); + + assign_s2s(VOLUME,ps[1],ps[3]); + bnd_s2zero(ALL_PTS,ps[3]); + mulr_spinor_add(VOLUME,ps[3],ps[1],-1.0f); + del[2]=norm_square(VOLUME,1,ps[3]); + del[2]=(float)(sqrt((double)(del[1]))); + + if (my_rank==0) + printf("check = %.2e, bnd checks = %.1e,%.1e\n", + del[0],del[1],del[2]); + } + + error_chk(); + + ie=assign_swd2swbgr(SAP_BLOCKS,ODD_PTS); + error_root(ie,1,"main [check2.c]", + "The inversion of the SW term was not safe"); + + set_s2zero(VOLUME,ps[0]); + random_s(VOLUME,ps[1],1.0f); + bnd_s2zero(ALL_PTS,ps[1]); + normalize(VOLUME,1,ps[1]); + assign_s2s(VOLUME,ps[1],ps[2]); + + if (my_rank==0) + { + printf("\n"); + printf("Even-odd preconditioned MinRes block solver:\n"); + } + + for (n=0;n<8;n++) + { + sap(mu,1,nmr,ps[0],ps[1]); + res=norm_square(VOLUME,1,ps[1]); + res=(float)(sqrt((double)(res))); + + if (my_rank==0) + printf("n = %d: \t residue = %.2e\t ",n+1,res); + + Dw(mu,ps[0],ps[3]); + mulr_spinor_add(VOLUME,ps[3],ps[2],-1.0f); + mulr_spinor_add(VOLUME,ps[3],ps[1],1.0f); + del[0]=norm_square(VOLUME,1,ps[3]); + del[0]=(float)(sqrt((double)(del[0]))); + + assign_s2s(VOLUME,ps[0],ps[3]); + bnd_s2zero(ALL_PTS,ps[3]); + mulr_spinor_add(VOLUME,ps[3],ps[0],-1.0f); + del[1]=norm_square(VOLUME,1,ps[3]); + del[1]=(float)(sqrt((double)(del[1]))); + + assign_s2s(VOLUME,ps[1],ps[3]); + bnd_s2zero(ALL_PTS,ps[3]); + mulr_spinor_add(VOLUME,ps[3],ps[1],-1.0f); + del[2]=norm_square(VOLUME,1,ps[3]); + del[2]=(float)(sqrt((double)(del[1]))); + + if (my_rank==0) + printf("check = %.2e, bnd checks = %.1e,%.1e\n", + del[0],del[1],del[2]); + } + + if (my_rank==0) + printf("\n"); + } + + error_chk(); + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check2.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check2.in new file mode 100644 index 0000000000000000000000000000000000000000..df121eea37c1470e8dc1d19b2fb0958929cc9eb3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check2.in @@ -0,0 +1,3 @@ +bs 8 4 4 4 +mu 0.123 +nmr 4 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..527fa3f25f0f8c182fbdd435c9822a5d96c344ee --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check3.c @@ -0,0 +1,260 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check and performance of the SAP+GCR solver. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "archive.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "dirac.h" +#include "sap.h" +#include "global.h" + +int my_rank,id,first,last,step; +int bs[4],nmr,ncy,nkv,nmx,eoflg,bc; +double kappa,csw,mu,cF,cF_prime; +double phi[2],phi_prime[2],m0,res; +char cnfg_dir[NAME_SIZE],cnfg_file[NAME_SIZE],nbase[NAME_SIZE]; + + +int main(int argc,char *argv[]) +{ + int isolv,nsize,icnfg,status; + double rho,nrm,del; + double wt1,wt2,wdt; + spinor_dble **psd; + lat_parms_t lat; + sap_parms_t sap; + tm_parms_t tm; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check3.log","w",stdout); + fin=freopen("check3.in","r",stdin); + + printf("\n"); + printf("Check and performance of the SAP+GCR solver\n"); + printf("-------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + find_section("Configurations"); + read_line("name","%s",nbase); + read_line("cnfg_dir","%s",cnfg_dir); + read_line("first","%d",&first); + read_line("last","%d",&last); + read_line("step","%d",&step); + + find_section("Lattice parameters"); + read_line("kappa","%lf",&kappa); + read_line("csw","%lf",&csw); + read_line("mu","%lf",&mu); + read_line("eoflg","%d",&eoflg); + + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + cF=1.0; + cF_prime=1.0; + + if (bc==1) + read_dprms("phi",2,phi); + + if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + + if (bc!=3) + read_line("cF","%lf",&cF); + + if (bc==2) + read_line("cF'","%lf",&cF_prime); + else + cF_prime=cF; + + find_section("SAP"); + read_iprms("bs",4,bs); + read_line("nmr","%d",&nmr); + read_line("ncy","%d",&ncy); + + find_section("GCR"); + read_line("nkv","%d",&nkv); + read_line("nmx","%d",&nmx); + read_line("res","%lf",&res); + + fclose(fin); + } + + MPI_Bcast(nbase,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(cnfg_dir,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(&first,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&last,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&step,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&csw,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&eoflg,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(&nkv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + lat=set_lat_parms(5.5,1.0,1,&kappa,csw); + print_lat_parms(); + + set_bc_parms(bc,1.0,1.0,cF,cF_prime,phi,phi_prime); + print_bc_parms(); + + sap=set_sap_parms(bs,0,nmr,ncy); + m0=lat.m0[0]; + (void)set_sw_parms(m0); + tm=set_tm_parms(eoflg); + + start_ranlux(0,1234); + geometry(); + alloc_ws(2*nkv+1); + alloc_wsd(5); + psd=reserve_wsd(3); + + if (my_rank==0) + { + printf("mu = %.6f\n",mu); + printf("eoflg = %d\n\n",tm.eoflg); + + printf("bs = (%d,%d,%d,%d)\n",sap.bs[0],sap.bs[1],sap.bs[2],sap.bs[3]); + printf("nmr = %d\n",sap.nmr); + printf("ncy = %d\n\n",sap.ncy); + + printf("nkv = %d\n",nkv); + printf("nmx = %d\n",nmx); + printf("res = %.2e\n\n",res); + + printf("Configurations %sn%d -> %sn%d in steps of %d\n\n", + nbase,first,nbase,last,step); + fflush(flog); + } + + error_root(((last-first)%step)!=0,1,"main [check3.c]", + "last-first is not a multiple of step"); + + nsize=name_size("%s/%sn%d",cnfg_dir,nbase,last); + error_root(nsize>=NAME_SIZE,1,"main [check3.c]", + "cnfg_dir name is too long"); + + for (icnfg=first;icnfg<=last;icnfg+=step) + { + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + import_cnfg(cnfg_file); + + if (my_rank==0) + { + printf("Configuration no %d\n",icnfg); + fflush(flog); + } + + chs_ubnd(-1); + random_sd(VOLUME,psd[0],1.0); + bnd_sd2zero(ALL_PTS,psd[0]); + nrm=sqrt(norm_square_dble(VOLUME,1,psd[0])); + + for (isolv=0;isolv<2;isolv++) + { + assign_sd2sd(VOLUME,psd[0],psd[2]); + set_sap_parms(bs,isolv,nmr,ncy); + + rho=sap_gcr(nkv,nmx,res,mu,psd[0],psd[1],&status); + + error_chk(); + mulr_spinor_add_dble(VOLUME,psd[2],psd[0],-1.0); + del=norm_square_dble(VOLUME,1,psd[2]); + error_root(del!=0.0,1,"main [check3.c]", + "Source field is not preserved"); + + Dw_dble(mu,psd[1],psd[2]); + mulr_spinor_add_dble(VOLUME,psd[2],psd[0],-1.0); + del=sqrt(norm_square_dble(VOLUME,1,psd[2])); + + if (my_rank==0) + { + printf("isolv = %d:\n",isolv); + printf("status = %d\n",status); + printf("rho = %.2e, res = %.2e\n",rho,res); + printf("check = %.2e, check = %.2e\n",del,del/nrm); + } + + assign_sd2sd(VOLUME,psd[0],psd[2]); + + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + rho=sap_gcr(nkv,nmx,res,mu,psd[2],psd[2],&status); + + MPI_Barrier(MPI_COMM_WORLD); + wt2=MPI_Wtime(); + wdt=wt2-wt1; + + if (my_rank==0) + { + printf("time = %.2e sec (total)\n",wdt); + if (status>0) + printf(" = %.2e usec (per point and GCR iteration)", + (1.0e6*wdt)/((double)(status)*(double)(VOLUME))); + printf("\n\n"); + fflush(flog); + } + + mulr_spinor_add_dble(VOLUME,psd[2],psd[1],-1.0); + del=norm_square_dble(VOLUME,1,psd[2]); + error_root(del!=0.0,1,"main [check3.c]", + "Incorrect result when the input and " + "output fields coincide"); + } + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check3.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check3.in new file mode 100644 index 0000000000000000000000000000000000000000..4979a85d71f8f9b34f7d7c90ba517b9a035fb9d5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check3.in @@ -0,0 +1,30 @@ + +[Configurations] +name 16x8x8x8b6.00id2 +cnfg_dir /home/data/openQCD/cnfg +first 7 +last 7 +step 1 + +[Lattice parameters] +kappa 0.1280 +csw 1.2 +mu 0.0123 +eoflg 1 + +[Boundary conditions] +type 0 +#phi 0.12 -0.56 +#phi' 0.92 0.76 +cF 0.95 +#cF' 0.90 + +[SAP] +bs 4 4 4 4 +nmr 4 +ncy 5 + +[GCR] +nkv 16 +nmx 128 +res 1.0e-12 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/time1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/time1.c new file mode 100644 index 0000000000000000000000000000000000000000..09023a0157d209a5a5c87a0bd7407986b05ce7e1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/time1.c @@ -0,0 +1,206 @@ + +/******************************************************************************* +* +* File time1.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Timing of blk_mres() and blk_eo_mres(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "sap.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,count,nt; + int nb,isw,nmr,bs[4]; + int n,ie; + float mu; + double phi[2],phi_prime[2]; + double wt1,wt2,wdt; + spinor **ps; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time1.log","w",stdout); + fin=freopen("time1.in","r",stdin); + + printf("\n"); + printf("Timing of blk_mres() and blk_eo_mres()\n"); + printf("--------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + +#if (defined x64) +#if (defined AVX) + printf("Using AVX instructions\n"); +#else + printf("Using SSE3 instructions and 16 xmm registers\n"); +#endif +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + printf("\n"); + + read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + read_line("nmr","%d",&nmr); + fclose(fin); + + printf("bs = %d %d %d %d\n",bs[0],bs[1],bs[2],bs[3]); + printf("nmr = %d\n\n",nmr); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [time1.c]", + "Syntax: time1 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_ws(1); + set_sap_parms(bs,0,1,1); + alloc_bgr(SAP_BLOCKS); + + set_sw_parms(0.0123); + mu=0.0785f; + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + assign_ud2ubgr(SAP_BLOCKS); + assign_swd2swbgr(SAP_BLOCKS,NO_PTS); + + ps=reserve_ws(1); + random_s(VOLUME,ps[0],1.0f); + bnd_s2zero(ALL_PTS,ps[0]); + normalize(VOLUME,1,ps[0]); + blk_list(SAP_BLOCKS,&nb,&isw); + + nt=(int)(1.0e7/(double)(nmr*VOLUME)); + if (nt<2) + nt=2; + wdt=0.0; + + while (wdt<5.0) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + for (count=0;count +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "sap.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,count,nt; + int ncy,nmr,bs[4]; + int n,ie; + float mu; + double phi[2],phi_prime[2]; + double rbb,wt1,wt2,wdt; + spinor **ps; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time2.log","w",stdout); + fin=freopen("time2.in","r",stdin); + + printf("\n"); + printf("Timing of the SAP preconditioner\n"); + printf("--------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + +#if (defined x64) +#if (defined AVX) + printf("Using AVX instructions\n"); +#else + printf("Using SSE3 instructions and 16 xmm registers\n"); +#endif +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + printf("\n"); + + read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + read_line("ncy","%d",&ncy); + read_line("nmr","%d",&nmr); + fclose(fin); + + printf("bs = %d %d %d %d\n",bs[0],bs[1],bs[2],bs[3]); + printf("ncy = %d\n",ncy); + printf("nmr = %d\n\n",nmr); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [time2.c]", + "Syntax: time2 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_ws(3); + set_sap_parms(bs,0,1,1); + alloc_bgr(SAP_BLOCKS); + + set_sw_parms(0.0123); + mu=0.0785f; + rbb=2.0*(1.0/(double)(bs[0])+1.0/(double)(bs[1])+ + 1.0/(double)(bs[2])+1.0/(double)(bs[3])); + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + assign_ud2ubgr(SAP_BLOCKS); + assign_swd2swbgr(SAP_BLOCKS,NO_PTS); + + ps=reserve_ws(3); + random_s(VOLUME,ps[2],1.0f); + bnd_s2zero(ALL_PTS,ps[2]); + normalize(VOLUME,1,ps[2]); + + nt=(int)(2.0e6/(double)(ncy*nmr*VOLUME)); + if (nt<2) + nt=2; + wdt=0.0; + + while (wdt<5.0) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + for (count=0;count that allows the type of +boundary condition to be chosen at runtime. When the option is not set, open +boundary conditions are assumed. + +The option may be set but has no effect in the case of check1. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sflds/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sflds/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..02b4aff6412743a9f5677c37f3f1bc712518c2b5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sflds/Makefile @@ -0,0 +1,130 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 + +FLAGS = flags lat_parms dfl_parms + +LATTICE = bcnds geometry + +RANDOM = ranlux ranlxs ranlxd gauss + +LINALG = cmatrix_dble liealg salg salg_dble + +UTILS = endian mutils utils wspace + +UFLDS = uflds + +SFLDS = sflds Pbnd Pbnd_dble scom sdcom + +SU3FCTS = su3prod su3ren cm3x3 random_su3 + +MODULES = $(FLAGS) $(LATTICE) $(RANDOM) $(LINALG) $(UTILS) \ + $(UFLDS) $(SFLDS) $(SU3FCTS) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/random:$(MDIR)/linalg:\ + $(MDIR)/utils:$(MDIR)/uflds:$(MDIR)/sflds:$(MDIR)/su3fcts + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sflds/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sflds/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..b9a0f1644855934fe81fa94f5e6eee21d684945c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sflds/check1.c @@ -0,0 +1,339 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2005, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the programs in the module sflds.c. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "linalg.h" +#include "sflds.h" +#include "global.h" + +#define NFLDS 3 + +typedef union +{ + spinor s; + float r[24]; +} spin_t; + +typedef union +{ + spinor_dble s; + double r[24]; +} spin_dble_t; + +static float sig[NFLDS]; +static double sigd[NFLDS]; + + +int main(int argc,char *argv[]) +{ + int my_rank,ie,k,i,ix; + float *r; + double *rd,var,var_all,d,dmax; + spinor **ps; + spinor_dble **psd; + spin_t *sps; + spin_dble_t *spsd; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + printf("\n"); + printf("Check of the programs in the module sflds.c\n"); + printf("-------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + } + + start_ranlux(0,12345); + geometry(); + alloc_ws(2*NFLDS); + alloc_wsd(2*NFLDS); + ps=reserve_ws(2*NFLDS); + psd=reserve_wsd(2*NFLDS); + ie=0; + + for (k=0;k = %.4e (sigma^2 = %.4e)\n", + k,var_all,sig[k]*sig[k]); + } + } + + ie=0; + + for (k=0;k = %.4e (sigma^2 = %.4e)\n", + k,var_all,sigd[k]*sigd[k]); + } + } + + ie=0; + + for (k=0;kdmax) + dmax=d; + } + + if (my_rank==0) + { + printf("\n"); + printf("Relative deviations (should be less than 1.0e-7 or so):\n"); + printf("diff_s2s(): %.1e\n",sqrt(dmax)); + } + + dmax=0.0; + + for (k=0;kdmax) + dmax=d; + } + + if (my_rank==0) + printf("add_s2sd(): %.1e\n",sqrt(dmax)); + + dmax=0.0; + + for (k=0;kdmax) + dmax=d; + } + + if (my_rank==0) + { + printf("diff_sd2s(): %.1e\n\n",sqrt(dmax)); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sflds/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sflds/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..71e53110398d1aa45b1797a0e26a2ffcc4ec05ac --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sflds/check2.c @@ -0,0 +1,768 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the communication programs in scom.c and sdcom.c. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "sflds.h" +#include "linalg.h" +#include "lattice.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) +#define NFLDS 4 + +typedef union +{ + spinor_dble s; + double r[24]; +} spin_dble_t; + +static double p[4]; +static spinor_dble rs ALIGNED16; +static const spinor_dble sd0={{{0.0}}}; + + +static int is_zero_dble(spinor_dble *s) +{ + int i,ie; + spin_dble_t *sp; + + sp=(spin_dble_t*)(s); + ie=1; + + for (i=0;i<24;i++) + ie&=((*sp).r[i]==0.0); + + return ie; +} + + +static int check_int_bnd_dble(spinor_dble *s) +{ + int bc,ix,iy,t; + int ie; + + bc=bc_type(); + ie=1; + + for (ix=0;ix=(VOLUME/2))&&(t==(N0-1))&&((bc==1)||(bc==2))) + { + iy=iup[ix][0]; + ie&=is_zero_dble(s+iy-ix); + } + else + ie&=(is_zero_dble(s)^0x1); + + s+=1; + } + + return ie; +} + + +static int check_ext_bnd_dble(spinor_dble *s) +{ + int bc,ix,t; + int ie; + + bc=bc_type(); + ie=1; + + for (ix=0;ix=VOLUME)&& + ((ifc>1)|| + ((ifc==0)&&((cpr[0]>0)||(bc==3)))|| + ((ifc==1)&&((cpr[0]<(NPROC0-1))||(bc==3))))) + { + pt=p[0]*(double)(x[0]+bo[0]); + pv=p[1]*(double)(x[1]+bo[1])+ + p[2]*(double)(x[2]+bo[2])+ + p[3]*(double)(x[3]+bo[3]); + + if (bc==3) + { + z.re=cos(pt+pv); + z.im=sin(pt+pv); + } + else + { + z.re=sin(pt)*cos(pv); + z.im=sin(pt)*sin(pv); + } + + s[iy].c1=mul_cplx(z,&(rs.c1)); + s[iy].c2=mul_cplx(z,&(rs.c2)); + s[iy].c3=mul_cplx(z,&(rs.c3)); + s[iy].c4=mul_cplx(z,&(rs.c4)); + s[iy]=theta(ifc^is,s+iy); + } + } + } + } + } + } + } + + bnd_sd2zero(EVEN_PTS,s); +} + + +static double check_cpsd_int(int is,spinor_dble *s) +{ + int bc,bo[4]; + int x0,x1,x2,x3,x[4]; + int ix,iy,ifc,mu,i; + double pt,pv,d,dmax; + complex_dble z; + spin_dble_t r,*sp; + + bc=bc_type(); + bo[0]=cpr[0]*L0; + bo[1]=cpr[1]*L1; + bo[2]=cpr[2]*L2; + bo[3]=cpr[3]*L3; + dmax=0.0; + + for (x0=0;x0=VOLUME)&& + ((ifc>1)|| + ((ifc==0)&&((cpr[0]>0)||(bc==3)))|| + ((ifc==1)&&((cpr[0]<(NPROC0-1))||(bc==3))))) + { + pt=p[0]*(double)(x[0]+bo[0]); + pv=p[1]*(double)(x[1]+bo[1])+ + p[2]*(double)(x[2]+bo[2])+ + p[3]*(double)(x[3]+bo[3]); + + if (bc==3) + { + z.re=cos(pt+pv); + z.im=sin(pt+pv); + } + else + { + z.re=sin(pt)*cos(pv); + z.im=sin(pt)*sin(pv); + } + + r.s.c1=mul_cplx(z,&(rs.c1)); + r.s.c2=mul_cplx(z,&(rs.c2)); + r.s.c3=mul_cplx(z,&(rs.c3)); + r.s.c4=mul_cplx(z,&(rs.c4)); + sp=(spin_dble_t*)(s+iy); + + for (i=0;i<18;i++) + r.r[i]-=(*sp).r[i]; + + r.s=theta((ifc^0x1)^is,&(r.s)); + + for (i=0;i<18;i++) + { + d=fabs(r.r[i]); + if (d>dmax) + dmax=d; + } + } + } + } + } + } + } + } + + if (NPROC>1) + { + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + return dmax; +} + + +static double check_cpsd_ext(int is,spinor_dble *s) +{ + int bc,bo[4]; + int x0,x1,x2,x3; + int ix,iy,ifc,mu,i; + double pt,pv,d,dmax; + complex_dble z; + spin_dble_t r,*sp; + + bc=bc_type(); + bo[0]=cpr[0]*L0; + bo[1]=cpr[1]*L1; + bo[2]=cpr[2]*L2; + bo[3]=cpr[3]*L3; + dmax=0.0; + + for (x0=0;x0=VOLUME)&& + ((ifc>1)|| + ((ifc==0)&&((cpr[0]>0)||(bc==3)))|| + ((ifc==1)&&((cpr[0]<(NPROC0-1))||(bc==3))))) + { + pt=p[0]*(double)(x0+bo[0]); + pv=p[1]*(double)(x1+bo[1])+ + p[2]*(double)(x2+bo[2])+ + p[3]*(double)(x3+bo[3]); + + if (bc==3) + { + z.re=cos(pt+pv); + z.im=sin(pt+pv); + } + else + { + z.re=sin(pt)*cos(pv); + z.im=sin(pt)*sin(pv); + } + + r.s.c1=mul_cplx(z,&(rs.c1)); + r.s.c2=mul_cplx(z,&(rs.c2)); + r.s.c3=mul_cplx(z,&(rs.c3)); + r.s.c4=mul_cplx(z,&(rs.c4)); + r.s=theta((ifc^0x1)^is,&(r.s)); + + for (i=0;i<18;i++) + (*sp).r[i]-=r.r[i]; + } + } + + for (i=0;i<18;i++) + { + d=fabs((*sp).r[i]); + if (d>dmax) + dmax=d; + } + } + else + { + for (i=0;i<18;i++) + { + d=fabs((*sp).r[i]); + if (d>dmax) + dmax=d; + } + } + } + } + } + } + + if (NPROC>1) + { + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + return dmax; +} + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,ie,is,k; + double phi[2],phi_prime[2]; + double cG,cG_prime,cF,cF_prime; + double d,dmax; + spinor **ps; + spinor_dble **psd; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check2.log","w",stdout); + printf("\n"); + printf(" Check of the communication programs in scom.c and sdcom.c\n"); + printf("----------------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check2.c]", + "Syntax: check2 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + cG=0.97; + cG_prime=1.056; + cF=0.82; + cF_prime=1.12; + set_bc_parms(bc,cG,cG_prime,cF,cF_prime,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_ws(NFLDS); + alloc_wsd(NFLDS); + + ps=reserve_ws(NFLDS); + psd=reserve_wsd(NFLDS); + dmax=0.0; + + for (is=0;is<2;is++) + { + for (k=0;kdmax) + dmax=d; + + random_sd(NSPIN,psd[k],1.0); + assign_sd2s(NSPIN,psd[k],ps[k]); + d=(double)(norm_square(NSPIN,1,ps[k])); + cps_ext_bnd(is,ps[k]); + cpsd_ext_bnd(is,psd[k]); + assign_sd2s(NSPIN,psd[k],ps[k+1]); + mulr_spinor_add(NSPIN,ps[k],ps[k+1],-1.0f); + d=(double)(norm_square(NSPIN,1,ps[k]))/d; + d=sqrt(d); + if (d>dmax) + dmax=d; + } + } + + if (my_rank==0) + { + printf("Maximal relative deviation single-/double-precision programs" + " = %.1e\n",dmax); + printf("Now checking double-precision programs:\n"); + } + + ie=1; + + for (is=0;is<2;is++) + { + for (k=0;kdmax) + dmax=d; + d=check_cpsd_int(is,psd[0]); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + printf("Maximal deviation (cpsd_int_bnd) = %.1e\n",dmax); + + dmax=0.0; + + for (is=0;is<2;is++) + { + random_sd(NSPIN,psd[0],1.0); + set_sd_bnd(is,psd[0]); + assign_sd2sd(NSPIN,psd[0],psd[1]); + cpsd_ext_bnd(is,psd[0]); + mulr_spinor_add_dble(NSPIN-VOLUME,psd[1]+VOLUME,psd[0]+VOLUME,-1.0); + d=norm_square_dble(NSPIN-VOLUME,1,psd[1]+VOLUME); + if (d>dmax) + dmax=d; + mulr_spinor_add_dble(VOLUME,psd[0],psd[1],-1.0); + d=check_cpsd_ext(is,psd[0]); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + printf("Maximal deviation (cpsd_ext_bnd) = %.1e\n\n",dmax); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..cb78e01ea2be290c4b24decbdb5389c6212046e8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/INDEX @@ -0,0 +1,14 @@ + +Calculation of the Sheikholeslami-Wohlert term + +check1 Allocation, assignment and inversion of the global SW arrays. + +check2 Check of the gauge covariance of the SW term. + +check3 Check of the SW term for abelian background fields. + +time1 Timing of the program sw_term(). + +All programs accept the option -bc that allows the type of boundary +condition to be chosen. When the option is not set, open boundary conditions +are assumed. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..57cf393b761576cd861acc339ef6714fe5f37834 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/Makefile @@ -0,0 +1,136 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 time1 + +FLAGS = flags lat_parms dfl_parms + +LATTICE = bcnds uidx ftidx geometry + +LINALG = salg_dble liealg cmatrix_dble + +RANDOM = ranlux ranlxs ranlxd gauss + +UFLDS = plaq_sum shift uflds udcom + +SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +UTILS = endian mutils utils wspace + +SFLDS = sflds + +TCHARGE = ftcom ftensor + +SW_TERM = pauli pauli_dble swflds sw_term + + +MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(RANDOM) $(UFLDS) \ + $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/random:\ + $(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:$(MDIR)/sflds:\ + $(MDIR)/tcharge:$(MDIR)/sw_term + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..faf6f2a16be67e882d9e5b8ae6e835398ad6e90e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/check1.c @@ -0,0 +1,403 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2005, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Allocation, assignment and inversion of the global SW arrays. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sw_term.h" +#include "global.h" + +#define N0 (NPROC0*L0) + +typedef union +{ + weyl_dble w; + complex_dble c[6]; +} spin_dble_t; + +static pauli_dble *sswd=NULL; +static spin_dble_t vd ALIGNED32; +static const weyl_dble vd0={{{0.0}}}; + + +static void save_swd(void) +{ + pauli_dble *pa,*pb,*pm; + + if (sswd==NULL) + { + sswd=amalloc(2*VOLUME*sizeof(*sswd),ALIGN); + error(sswd==NULL,1,"save_swd [check1.c]", + "Unable to allocate auxiliary array"); + } + + pa=swdfld(); + pb=sswd; + pm=pa+2*VOLUME; + + for (;padmax) + dmax=d; + } + + pb+=1; + } + + return dmax; +} + + +static double cmp_iswd(ptset_t set) +{ + int k,l; + double d,dmax; + pauli_dble *pa,*pb,*pm; + + pa=swdfld(); + pb=sswd; + pm=pa; + + if (set==EVEN_PTS) + pm=pa+VOLUME; + else if (set==ODD_PTS) + { + pa+=VOLUME; + pb+=VOLUME; + pm=pa+VOLUME; + } + else if (set==ALL_PTS) + pm=pa+2*VOLUME; + + dmax=0.0; + + for (;padmax) + dmax=d; + } + } + + pb+=1; + } + + return sqrt(dmax); +} + + +static double cmp_sw2swd(ptset_t set) +{ + int k; + double d,dmax; + pauli *pa,*pm; + pauli_dble *pb; + + pa=swfld(); + pb=swdfld(); + pm=pa; + + if (set==EVEN_PTS) + pm=pa+VOLUME; + else if (set==ODD_PTS) + { + pa+=VOLUME; + pb+=VOLUME; + pm=pa+VOLUME; + } + else if (set==ALL_PTS) + pm=pa+2*VOLUME; + + dmax=0.0; + + for (;padmax) + dmax=d; + } + + pb+=1; + } + + return dmax; +} + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,ix,ie; + double phi[2],phi_prime[2]; + double d,dmax; + pauli *sw; + pauli_dble *swd; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + printf("\n"); + printf("Initialization and inversion of the global SW arrays\n"); + printf("----------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check1.c]", + "Syntax: check1 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.301,0.789,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + + set_sw_parms(-0.0123); + sw=swfld(); + swd=swdfld(); + ie=1; + + for (ix=0;ix<(2*VOLUME);ix++) + { + ie|=is_unity(sw); + ie|=is_unity_dble(swd); + sw+=1; + swd+=1; + } + + error(ie!=1,1,"main [check1.c]","SW fields are not correctly initialized"); + + print_flags(); + random_ud(); + sw_term(NO_PTS); + ie=check_swbnd(); + error(ie!=1,1,"main [check1.c]","SW field has incorrect boundary values"); + save_swd(); + + chs_ubnd(-1); + sw_term(NO_PTS); + d=cmp_swd(ALL_PTS); + error(d!=0.0,1,"main [check1.c]", + "SW term changed after calling chs_ubnd(-1)"); + + ie=sw_term(EVEN_PTS); + error(ie!=0,1,"main [check1.c]","Unsafe inversion of swd_e"); + d=cmp_iswd(EVEN_PTS); + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("Inverted swd_e\n"); + printf("Maximal deviation of swd_e = %.1e\n",dmax); + } + + d=cmp_swd(ODD_PTS); + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + printf("Maximal deviation of swd_o = %.1e\n\n",dmax); + + print_flags(); + random_ud(); + sw_term(NO_PTS); + save_swd(); + + ie=sw_term(ODD_PTS); + error(ie!=0,1,"main [check1.c]","Unsafe inversion of swd_o"); + d=cmp_swd(EVEN_PTS); + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("Inverted swd_o\n"); + printf("Maximal deviation of swd_e = %.1e\n",dmax); + } + + d=cmp_iswd(ODD_PTS); + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + printf("Maximal deviation of swd_o = %.1e\n\n",dmax); + + print_flags(); + assign_swd2sw(); + d=cmp_sw2swd(ALL_PTS); + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("Assigned swd to sw\n"); + printf("Maximal deviation = %.1e\n\n",dmax); + } + + print_flags(); + random_ud(); + sw_term(NO_PTS); + save_swd(); + + ie=sw_term(ALL_PTS); + error(ie!=0,1,"main [check1.c]","Unsafe inversion of swd"); + d=cmp_iswd(ALL_PTS); + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("Inverted swd\n"); + printf("Maximal deviation = %.1e\n\n",dmax); + } + + print_flags(); + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..c9f91cbdcd84793fedaaec012fe4cd1fdbac5d26 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/check2.c @@ -0,0 +1,366 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2005, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the gauge covariance of the SW term. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "global.h" + +#define N0 (NPROC0*L0) + +static int bc,nfc[8],ofs[8]; +static const su3_dble ud0={{0.0}}; +static su3_dble *g,*gbuf; +static su3_dble wd ALIGNED16; + + +static void pack_gbuf(void) +{ + int ifc,ib,ix; + + nfc[0]=FACE0/2; + nfc[1]=FACE0/2; + nfc[2]=FACE1/2; + nfc[3]=FACE1/2; + nfc[4]=FACE2/2; + nfc[5]=FACE2/2; + nfc[6]=FACE3/2; + nfc[7]=FACE3/2; + + ofs[0]=0; + ofs[1]=ofs[0]+nfc[0]; + ofs[2]=ofs[1]+nfc[1]; + ofs[3]=ofs[2]+nfc[2]; + ofs[4]=ofs[3]+nfc[3]; + ofs[5]=ofs[4]+nfc[4]; + ofs[6]=ofs[5]+nfc[5]; + ofs[7]=ofs[6]+nfc[6]; + + for (ifc=0;ifc<8;ifc++) + { + for (ib=0;ib0) + { + tag=mpi_tag(); + saddr=npr[ifc^0x1]; + raddr=npr[ifc]; + sbuf=gbuf+ofs[ifc]; + rbuf=g+VOLUME+ofs[ifc]; + + if (np&0x1) + { + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + } + } +} + + +static void random_g(void) +{ + int ix,t; + su3_dble unity,*gx; + + unity=ud0; + unity.c11.re=1.0; + unity.c22.re=1.0; + unity.c33.re=1.0; + gx=g; + + for (ix=0;ix0)||(bc!=1)) + random_su3_dble(gx); + else + (*gx)=unity; + + gx+=1; + } + + if (BNDRY>0) + { + pack_gbuf(); + send_gbuf(); + } +} + + +static void transform_ud(void) +{ + int ix,iy,t,ifc; + su3_dble *u; + + u=udfld(); + + for (ix=(VOLUME/2);ix]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + alloc_wsd(4); + psd=reserve_wsd(4); + + g=amalloc(NSPIN*sizeof(*g),4); + if (BNDRY!=0) + gbuf=amalloc((BNDRY/2)*sizeof(*gbuf),4); + + error((g==NULL)||((BNDRY!=0)&&(gbuf==NULL)),1,"main [check2.c]", + "Unable to allocate auxiliary arrays"); + + swp=set_sw_parms(-0.0123); + + if (my_rank==0) + printf("m0 = %.4e, csw = %.4e, cF = %.4e, cF' = %.4e\n\n", + swp.m0,swp.csw,swp.cF[0],swp.cF[1]); + + random_g(); + random_ud(); + + for (i=0;i<4;i++) + random_sd(VOLUME,psd[i],1.0); + + (void)sw_term(NO_PTS); + sw=swdfld(); + apply_sw_dble(VOLUME,0.789,sw,psd[0],psd[1]); + + transform_sd(psd[0],psd[2]); + transform_ud(); + (void)sw_term(NO_PTS); + sw=swdfld(); + apply_sw_dble(VOLUME,0.789,sw,psd[2],psd[3]); + transform_sd(psd[1],psd[2]); + + mulr_spinor_add_dble(VOLUME,psd[3],psd[2],-1.0); + d=norm_square_dble(VOLUME,1,psd[3])/norm_square_dble(VOLUME,1,psd[0]); + error_chk(); + + if (my_rank==0) + { + printf("Maximal normalized difference = %.2e\n",sqrt(d)); + printf("(should be around 1*10^(-15) or so)\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..f1c4e0462269a0869be2ea0f68bbd19d9f21f225 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/check3.c @@ -0,0 +1,585 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2005, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the SW term for abelian background fields. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int bc,np[4]; +static double t[3],a[4],p[4],inp[4]; +static double (*Fhat)[3]; +static const su3_dble ud0={{0.0}}; +static spinor_dble ws; + + +static void alloc_Fhat(void) +{ + Fhat=amalloc(VOLUME*sizeof(*Fhat),3); + + error(Fhat==NULL,1,"alloc_Fhat [check3.c]", + "Unable to allocate auxiliary array"); +} + + +static void set_parms(void) +{ + int my_rank; + double pi; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + gauss_dble(t,2); + t[2]=-t[0]-t[1]; + + ranlxd(a,4); + + np[0]=(int)(a[0]*(double)(N0)); + np[1]=(int)(a[1]*(double)(N1)); + np[2]=(int)(a[2]*(double)(N2)); + np[3]=(int)(a[3]*(double)(N3)); + + pi=4.0*atan(1.0); + + p[0]=(double)(np[0])*2.0*pi/(double)(N0); + p[1]=(double)(np[1])*2.0*pi/(double)(N1); + p[2]=(double)(np[2])*2.0*pi/(double)(N2); + p[3]=(double)(np[3])*2.0*pi/(double)(N3); + + gauss_dble(a,4); + } + + MPI_Bcast(t,3,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(a,4,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(p,4,MPI_DOUBLE,0,MPI_COMM_WORLD); + + inp[0]=1.0/(double)(N0); + inp[1]=1.0/(double)(N1); + inp[2]=1.0/(double)(N2); + inp[3]=1.0/(double)(N3); +} + + +static double afld(int *x,int mu) +{ + double xt[4],px; + + xt[0]=(double)(safe_mod(x[0],N0)); + xt[1]=(double)(safe_mod(x[1],N1)); + xt[2]=(double)(safe_mod(x[2],N2)); + xt[3]=(double)(safe_mod(x[3],N3)); + + px=p[0]*xt[0]+p[1]*xt[1]+p[2]*xt[2]+p[3]*xt[3]; + + return a[mu]*sin(px); +} + + +static void ftplaq(int *x,int mu,int nu,double *ftp) +{ + double sm,om[3],*phi; + bc_parms_t bcp; + + bcp=bc_parms(); + + if ((x[0]==0)&&(mu==0)&&(bc==1)) + { + sm=afld(x,mu); + x[mu]+=1; + sm+=afld(x,nu); + x[mu]-=1; + x[nu]+=1; + sm-=afld(x,mu); + x[nu]-=1; + + phi=bcp.phi[0]; + om[0]=t[0]*sm-phi[0]*inp[nu]; + om[1]=t[1]*sm-phi[1]*inp[nu]; + om[2]=t[2]*sm-phi[2]*inp[nu]; + } + else if ((x[0]==(N0-1))&&(mu==0)&&((bc==1)||(bc==2))) + { + sm=afld(x,mu)-afld(x,nu); + x[nu]+=1; + sm-=afld(x,mu); + x[nu]-=1; + + phi=bcp.phi[1]; + om[0]=t[0]*sm+phi[0]*inp[nu]; + om[1]=t[1]*sm+phi[1]*inp[nu]; + om[2]=t[2]*sm+phi[2]*inp[nu]; + } + else + { + sm=afld(x,mu)-afld(x,nu); + x[mu]+=1; + sm+=afld(x,nu); + x[mu]-=1; + x[nu]+=1; + sm-=afld(x,mu); + x[nu]-=1; + + om[0]=t[0]*sm; + om[1]=t[1]*sm; + om[2]=t[2]*sm; + } + + ftp[0]=sin(om[0]); + ftp[1]=sin(om[1]); + ftp[2]=sin(om[2]); +} + + +static void set_ud(void) +{ + int bo[4],x[4]; + int x0,x1,x2,x3,ix,ifc,mu; + double r1,r2; + su3_dble *udb,*u; + + udb=udfld(); + bo[0]=cpr[0]*L0; + bo[1]=cpr[1]*L1; + bo[2]=cpr[2]*L2; + bo[3]=cpr[3]*L3; + + for (x0=0;x0=(VOLUME/2)) + { + u=udb+8*(ix-(VOLUME/2)); + + for (ifc=0;ifc<8;ifc++) + { + mu=ifc/2; + if (ifc&0x1) + x[mu]-=1; + r1=afld(x,mu); + if (ifc&0x1) + x[mu]+=1; + r2=t[0]*r1; + (*u)=ud0; + (*u).c11.re=cos(r2); + (*u).c11.im=sin(r2); + r2=t[1]*r1; + (*u).c22.re=cos(r2); + (*u).c22.im=sin(r2); + r2=t[2]*r1; + (*u).c33.re=cos(r2); + (*u).c33.im=sin(r2); + u+=1; + } + } + } + } + } + } + + set_bc(); + set_flags(UPDATED_UD); +} + + +static void compute_Fhat(int mu,int nu) +{ + int bo[4],x[4]; + int x0,x1,x2,x3,ix; + double ftp[4][3]; + + bo[0]=cpr[0]*L0; + bo[1]=cpr[1]*L1; + bo[2]=cpr[2]*L2; + bo[3]=cpr[3]*L3; + + for (x0=0;x00)&&((x[0]<(N0-1))||(bc!=0)))||(bc==3)) + { + ftplaq(x,mu,nu,ftp[0]); + x[mu]-=1; + ftplaq(x,mu,nu,ftp[1]); + x[nu]-=1; + ftplaq(x,mu,nu,ftp[2]); + x[mu]+=1; + ftplaq(x,mu,nu,ftp[3]); + + Fhat[ix][0]=0.25*(ftp[0][0]+ftp[1][0]+ftp[2][0]+ftp[3][0]); + Fhat[ix][1]=0.25*(ftp[0][1]+ftp[1][1]+ftp[2][1]+ftp[3][1]); + Fhat[ix][2]=0.25*(ftp[0][2]+ftp[1][2]+ftp[2][2]+ftp[3][2]); + } + else + { + Fhat[ix][0]=0.0; + Fhat[ix][1]=0.0; + Fhat[ix][2]=0.0; + } + } + } + } + } +} + + +static su3_vector_dble mul_cplx(complex_dble z,su3_vector_dble s) +{ + su3_vector_dble r; + + r.c1.re=z.re*s.c1.re-z.im*s.c1.im; + r.c1.im=z.im*s.c1.re+z.re*s.c1.im; + r.c2.re=z.re*s.c2.re-z.im*s.c2.im; + r.c2.im=z.im*s.c2.re+z.re*s.c2.im; + r.c3.re=z.re*s.c3.re-z.im*s.c3.im; + r.c3.im=z.im*s.c3.re+z.re*s.c3.im; + + return r; +} + + +static spinor_dble mul_gamma(int mu,spinor_dble s) +{ + spinor_dble r; + complex_dble i,m_i,m_1; + + i.re=0.0; + i.im=1.0; + + m_i.re=0.0; + m_i.im=-1.0; + + m_1.re=-1.0; + m_1.im=0.0; + + if (mu==0) + { + r.c1=mul_cplx(m_1,s.c3); + r.c2=mul_cplx(m_1,s.c4); + r.c3=mul_cplx(m_1,s.c1); + r.c4=mul_cplx(m_1,s.c2); + } + else if (mu==1) + { + r.c1=mul_cplx(m_i,s.c4); + r.c2=mul_cplx(m_i,s.c3); + r.c3=mul_cplx(i,s.c2); + r.c4=mul_cplx(i,s.c1); + } + else if (mu==2) + { + r.c1=mul_cplx(m_1,s.c4); + r.c2=s.c3; + r.c3=s.c2; + r.c4=mul_cplx(m_1,s.c1); + } + else if (mu==3) + { + r.c1=mul_cplx(m_i,s.c3); + r.c2=mul_cplx(i,s.c4); + r.c3=mul_cplx(i,s.c1); + r.c4=mul_cplx(m_i,s.c2); + } + else + { + r.c1=s.c1; + r.c2=s.c2; + r.c3=mul_cplx(m_1,s.c3); + r.c4=mul_cplx(m_1,s.c4); + } + + return r; +} + + +static spinor_dble mul_sigma(int mu,int nu,spinor_dble s) +{ + complex_dble z; + spinor_dble r1,r2; + + r1=mul_gamma(nu,s); + r1=mul_gamma(mu,r1); + + r2=mul_gamma(mu,s); + r2=mul_gamma(nu,r2); + + _vector_sub_assign(r1.c1,r2.c1); + _vector_sub_assign(r1.c2,r2.c2); + _vector_sub_assign(r1.c3,r2.c3); + _vector_sub_assign(r1.c4,r2.c4); + + z.re=0.0; + z.im=0.5; + _vector_mulc(r2.c1,z,r1.c1); + _vector_mulc(r2.c2,z,r1.c2); + _vector_mulc(r2.c3,z,r1.c3); + _vector_mulc(r2.c4,z,r1.c4); + + return r2; +} + + +static void muladd_pauli(double csw,int mu,int nu, + spinor_dble *pk,spinor_dble *pl) +{ + int ix; + double r; + + compute_Fhat(mu,nu); + + csw=(-0.25)*csw; + + for (ix=0;ix]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.301,0.789,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_Fhat(); + alloc_wsd(3); + psd=reserve_wsd(3); + + set_sw_parms(-0.0123); + swp=sw_parms(); + dmax=0.0; + + if (my_rank==0) + printf("m0=%.4e, csw=%.4e, cF=%.4e, cF'=%.4e\n\n", + swp.m0,swp.csw,swp.cF[0],swp.cF[1]); + + for (n=0;n<4;n++) + { + set_parms(); + set_ud(); + (void)sw_term(NO_PTS); + sw=swdfld(); + + random_sd(VOLUME,psd[0],1.0); + apply_sw_dble(VOLUME,0.0,sw,psd[0],psd[1]); + mul_swd(swp.m0,swp.csw,psd[0],psd[2]); + bnd_corr(swp.cF,psd[0],psd[2]); + + mulr_spinor_add_dble(VOLUME,psd[2],psd[1],-1.0); + d=norm_square_dble(VOLUME,1,psd[2])/norm_square_dble(VOLUME,1,psd[0]); + d=sqrt(d); + if (d>dmax) + dmax=d; + + if (my_rank==0) + { + printf("Field number %d:\n",n+1); + printf("The parameters are:\n"); + printf("t=%.2f,%.2f,%.2f, a=%.2f,%.2f,%.2f,%.2f, ", + t[0],t[1],t[2],a[0],a[1],a[2],a[3]); + printf("np=%d,%d,%d,%d\n",np[0],np[1],np[2],np[3]); + printf("delta = %.2e\n\n",d); + } + } + + error_chk(); + + if (my_rank==0) + { + printf("Maximal deviation = %.1e\n\n",dmax); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/time1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/time1.c new file mode 100644 index 0000000000000000000000000000000000000000..f7eef6c947cdb7a808353c84dee0cd8233a32acb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/time1.c @@ -0,0 +1,145 @@ + +/******************************************************************************* +* +* File time1.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Timing of the program sw_term(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sw_term.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,count,nt; + double phi[2],phi_prime[2]; + double wt1,wt2,wdt; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time1.log","w",stdout); + printf("\n"); + printf("Timing of the program sw_term()\n"); + printf("-------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + +#if (defined AVX) + printf("Using AVX instructions\n\n"); +#elif (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n\n"); +#endif + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [time1.c]", + "Syntax: time1 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.301,0.789,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + set_sw_parms(-0.0123); + random_ud(); + + nt=(int)(5.0e5/(double)(VOLUME)); + if (nt<2) + nt=2; + wdt=0.0; + + while (wdt<5.0) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + for (count=0;count that allows the type of boundary +condition to be chosen. When the option is not set, open boundary conditions +are assumed. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..4828aa1177fbc44147c2451a109866e863f25686 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/Makefile @@ -0,0 +1,141 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 check4 check5 check6 + +FORCES = force0 + +FLAGS = flags lat_parms hmc_parms dfl_parms + +LATTICE = bcnds uidx ftidx geometry + +LINALG = liealg cmatrix_dble + +MDFLDS = mdflds fcom + +RANDOM = ranlux ranlxs ranlxd gauss random_su3 + +SFLDS = sflds + +SU3FCTS = chexp su3prod su3ren cm3x3 + +TCHARGE = ftcom ftensor tcharge ym_action + +UFLDS = plaq_sum shift uflds udcom bstap + +UTILS = endian mutils utils wspace + +WFLOW = wflow + +MODULES = $(FLAGS) $(FORCES) $(LATTICE) $(LINALG) $(MDFLDS) \ + $(RANDOM) $(SFLDS) $(SU3FCTS) $(TCHARGE) $(UFLDS) \ + $(UTILS) $(WFLOW) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/forces:$(MDIR)/lattice:$(MDIR)/linalg:\ + $(MDIR)/mdflds:$(MDIR)/random:$(MDIR)/su3fcts:$(MDIR)/sflds:\ + $(MDIR)/su3fcts:$(MDIR)/uflds:$(MDIR)/utils:$(MDIR)/tcharge:\ + $(MDIR)/wflow + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..b42cf3ad8d1bb37e155cc42a520bc5fad1d4ca3a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check1.c @@ -0,0 +1,375 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2009-2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the gauge and translation invariance of the topological charge. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "su3fcts.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "tcharge.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int bc,nfc[8],ofs[8]; +static const su3_dble ud0={{0.0}}; +static su3_dble *g,*gbuf; +static su3_dble wd ALIGNED16; + + +static void pack_gbuf(void) +{ + int ifc,ib,ix; + + nfc[0]=FACE0/2; + nfc[1]=FACE0/2; + nfc[2]=FACE1/2; + nfc[3]=FACE1/2; + nfc[4]=FACE2/2; + nfc[5]=FACE2/2; + nfc[6]=FACE3/2; + nfc[7]=FACE3/2; + + ofs[0]=0; + ofs[1]=ofs[0]+nfc[0]; + ofs[2]=ofs[1]+nfc[1]; + ofs[3]=ofs[2]+nfc[2]; + ofs[4]=ofs[3]+nfc[3]; + ofs[5]=ofs[4]+nfc[4]; + ofs[6]=ofs[5]+nfc[5]; + ofs[7]=ofs[6]+nfc[6]; + + for (ifc=0;ifc<8;ifc++) + { + for (ib=0;ib0) + { + tag=mpi_tag(); + saddr=npr[ifc^0x1]; + raddr=npr[ifc]; + sbuf=gbuf+ofs[ifc]; + rbuf=g+VOLUME+ofs[ifc]; + + if (np&0x1) + { + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + } + } +} + + +static void random_g(void) +{ + int ix,t; + su3_dble unity,*gx; + + unity=ud0; + unity.c11.re=1.0; + unity.c22.re=1.0; + unity.c33.re=1.0; + gx=g; + + for (ix=0;ix0)||(bc!=1)) + random_su3_dble(gx); + else + (*gx)=unity; + + gx+=1; + } + + if (BNDRY>0) + { + pack_gbuf(); + send_gbuf(); + } +} + + +static void transform_ud(void) +{ + int ix,iy,t,ifc; + su3_dble *u; + + u=udfld(); + + for (ix=(VOLUME/2);ix(bs[mu]/2)) + svec[mu]-=bs[mu]; + } + + MPI_Bcast(svec,4,MPI_INT,0,MPI_COMM_WORLD); +} + + +int main(int argc,char *argv[]) +{ + int my_rank,i,s[4]; + double phi[2],phi_prime[2]; + double d,dmax1,dmax2; + double Q1,Q2,q1,q2; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + printf("\n"); + printf("Gauge and translation invariance of the topological charge\n"); + printf("----------------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check1.c]", + "Syntax: check1 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + + g=amalloc(NSPIN*sizeof(*g),4); + + if (BNDRY>0) + gbuf=amalloc((BNDRY/2)*sizeof(*gbuf),4); + + error((g==NULL)||((BNDRY>0)&&(gbuf==NULL)),1,"main [check1.c]", + "Unable to allocate auxiliary arrays"); + + dmax1=0.0; + dmax2=0.0; + + for (i=0;i<8;i++) + { + random_ud(); + + Q1=tcharge(); + random_vec(s); + if (bc!=3) + s[0]=0; + shift_ud(s); + Q2=tcharge(); + + d=fabs(Q1-Q2); + if (d>dmax1) + dmax1=d; + + random_g(); + transform_ud(); + Q2=tcharge(); + + d=fabs(Q1-Q2); + if (d>dmax2) + dmax2=d; + + q1=Q1; + q2=Q2; + + MPI_Bcast(&q1,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&q2,1,MPI_INT,0,MPI_COMM_WORLD); + + error((q1!=Q1)||(q2!=Q2),1,"main [check1.c]", + "Charge is not globally the same"); + } + + error_chk(); + print_flags(); + + if (my_rank==0) + { + printf("Translation invariance = %.2e\n",dmax1); + printf("Gauge invariance = %.2e\n\n",dmax2); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..8759e74dccf6775b3076189fb931196931307797 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check2.c @@ -0,0 +1,463 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2009-2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Topological charge of constant abelian background fields. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "su3fcts.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "tcharge.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int bc,np[4],bo[4]; +static double mt[4][4],inp[4],twopi; +static su3_dble ud0={{0.0}}; + + +static double afld(int *x,int mu) +{ + int nu; + double xt[4],phi; + + xt[0]=(double)(safe_mod(x[0],N0)); + xt[1]=(double)(safe_mod(x[1],N1)); + xt[2]=(double)(safe_mod(x[2],N2)); + xt[3]=(double)(safe_mod(x[3],N3)); + + phi=0.0; + + for (nu=0;nu1) + { + MPI_Reduce(&qloc,&qall,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&qall,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + return qall; + } + else + return qloc; +} + + +static double Qmt(void) +{ + int i,mu,nu,ro,si; + double sm,phi,tr; + double ft1,ft2,ft3,fs1,fs2,fs3; + + sm=0.0; + mu=0; + nu=1; + ro=2; + si=3; + + for (i=0;i<3;i++) + { + phi=twopi*mt[mu][nu]*inp[mu]*inp[nu]; + + ft1=sin(phi); + ft2=ft1; + ft3=-sin(2.0*phi); + + tr=(ft1+ft2+ft3)/3.0; + + ft1-=tr; + ft2-=tr; + ft3-=tr; + + phi=twopi*mt[ro][si]*inp[ro]*inp[si]; + + fs1=sin(phi); + fs2=fs1; + fs3=-sin(2.0*phi); + + tr=(fs1+fs2+fs3)/3.0; + + fs1-=tr; + fs2-=tr; + fs3-=tr; + + sm+=(ft1*fs1+ft2*fs2+ft3*fs3); + + nu=nu+1; + ro=(ro+1)%4+(ro==3); + si=(si+1)%4+(si==3); + } + + sm/=(twopi*twopi); + + if (bc==0) + sm*=(double)((N0-2)*N1)*(double)(N2*N3); + else if (bc==1) + { + sm*=(double)((N0-3)*N1)*(double)(N2*N3); + sm+=Qtbnd(); + } + else if (bc==2) + { + sm*=(double)((N0-2)*N1)*(double)(N2*N3); + sm+=Qtbnd(); + } + else + sm*=(double)(N0*N1)*(double)(N2*N3); + + return sm; +} + + +static void choose_mt(void) +{ + int mu,nu; + double r[6]; + + ranlxd(r,6); + MPI_Bcast(r,6,MPI_DOUBLE,0,MPI_COMM_WORLD); + + mt[0][1]=(double)((int)(3.0*r[0])-1); + mt[0][2]=(double)((int)(3.0*r[1])-1); + mt[0][3]=(double)((int)(3.0*r[2])-1); + mt[1][2]=(double)((int)(3.0*r[3])-1); + mt[1][3]=(double)((int)(3.0*r[4])-1); + mt[2][3]=(double)((int)(3.0*r[5])-1); + + for (mu=0;mu<4;mu++) + { + mt[mu][mu]=0.0; + + for (nu=0;nu=(VOLUME/2)) + { + x[0]=bo[0]+x0; + x[1]=bo[1]+x1; + x[2]=bo[2]+x2; + x[3]=bo[3]+x3; + + u=udb+8*(ix-(VOLUME/2)); + + for (ifc=0;ifc<8;ifc++) + { + if (ifc&0x1) + x[ifc/2]-=1; + + phi=afld(x,ifc/2); + + if (ifc&0x1) + x[ifc/2]+=1; + + (*u)=ud0; + (*u).c11.re=cos(phi); + (*u).c11.im=sin(phi); + (*u).c22.re=(*u).c11.re; + (*u).c22.im=(*u).c11.im; + (*u).c33.re=cos(-2.0*phi); + (*u).c33.im=sin(-2.0*phi); + u+=1; + } + } + } + } + } + } + + set_bc(); + set_flags(UPDATED_UD); +} + + +int main(int argc,char *argv[]) +{ + int my_rank,i; + double phi[2],phi_prime[2]; + double Q1,Q2,d,dmax; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check2.log","w",stdout); + printf("\n"); + printf("Topological charge of constant abelian background fields\n"); + printf("--------------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check2.c]", + "Syntax: check2 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.9012,1.2034,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123); + geometry(); + + twopi=8.0*atan(1.0); + + np[0]=N0; + np[1]=N1; + np[2]=N2; + np[3]=N3; + + bo[0]=cpr[0]*L0; + bo[1]=cpr[1]*L1; + bo[2]=cpr[2]*L2; + bo[3]=cpr[3]*L3; + + inp[0]=1.0/(double)(np[0]); + inp[1]=1.0/(double)(np[1]); + inp[2]=1.0/(double)(np[2]); + inp[3]=1.0/(double)(np[3]); + + dmax=0.0; + + for (i=0;i<10;i++) + { + choose_mt(); + set_ud(); + Q1=Qmt(); + Q2=tcharge(); + + if (my_rank==0) + printf("Field no = %2d, Q1 = % 8.4e, Q2 = % 8.4e\n",i+1,Q1,Q2); + + d=fabs(Q1-Q2); + if (d>dmax) + dmax=d; + } + + error_chk(); + + if (my_rank==0) + { + printf("\n"); + printf("Maximal absolute deviation = %.1e\n\n",dmax); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..a730fd4ea6919e39e9a6a5d438b4cb1f4d3b703c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check3.c @@ -0,0 +1,155 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2009-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the program tcharge_slices(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "forces.h" +#include "wflow.h" +#include "tcharge.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int bc,n,dn; +static double eps,Q1,Q2,Q[N0],Q0[N0]; + + +int main(int argc,char *argv[]) +{ + int my_rank,i,imax,t; + double phi[2],phi_prime[2]; + double nplaq,act,dev; + FILE *fin=NULL,*flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check3.log","w",stdout); + fin=freopen("check3.in","r",stdin); + + printf("\n"); + printf("Check of the program tcharge_slices()\n"); + printf("-------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("n","%d",&n); + read_line("dn","%d",&dn); + read_line("eps","%lf",&eps); + fclose(fin); + + printf("n = %d\n",n); + printf("dn = %d\n",dn); + printf("eps = %.2e\n\n",eps); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check3.c]", + "Syntax: check3 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&n,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&dn,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&eps,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + set_lat_parms(6.0,1.0,0,NULL,1.0); + + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.9012,1.2034,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + alloc_wfd(2); + + if (bc==0) + nplaq=(double)(6*N0-6)*(double)(N1*N2*N3); + else + nplaq=(double)(6*N0)*(double)(N1*N2*N3); + + random_ud(); + imax=n/dn; + + for (i=0;i +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "su3fcts.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "tcharge.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int bc,nfc[8],ofs[8]; +static const su3_dble ud0={{0.0}}; +static su3_dble *g,*gbuf; +static su3_dble wd ALIGNED16; + + +static void pack_gbuf(void) +{ + int ifc,ib,ix; + + nfc[0]=FACE0/2; + nfc[1]=FACE0/2; + nfc[2]=FACE1/2; + nfc[3]=FACE1/2; + nfc[4]=FACE2/2; + nfc[5]=FACE2/2; + nfc[6]=FACE3/2; + nfc[7]=FACE3/2; + + ofs[0]=0; + ofs[1]=ofs[0]+nfc[0]; + ofs[2]=ofs[1]+nfc[1]; + ofs[3]=ofs[2]+nfc[2]; + ofs[4]=ofs[3]+nfc[3]; + ofs[5]=ofs[4]+nfc[4]; + ofs[6]=ofs[5]+nfc[5]; + ofs[7]=ofs[6]+nfc[6]; + + for (ifc=0;ifc<8;ifc++) + { + for (ib=0;ib0) + { + tag=mpi_tag(); + saddr=npr[ifc^0x1]; + raddr=npr[ifc]; + sbuf=gbuf+ofs[ifc]; + rbuf=g+VOLUME+ofs[ifc]; + + if (np&0x1) + { + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + } + } +} + + +static void random_g(void) +{ + int ix,t; + su3_dble unity,*gx; + + unity=ud0; + unity.c11.re=1.0; + unity.c22.re=1.0; + unity.c33.re=1.0; + gx=g; + + for (ix=0;ix0)||(bc!=1)) + random_su3_dble(gx); + else + (*gx)=unity; + + gx+=1; + } + + if (BNDRY>0) + { + pack_gbuf(); + send_gbuf(); + } +} + + +static void transform_ud(void) +{ + int ix,iy,t,ifc; + su3_dble *u; + + u=udfld(); + + for (ix=(VOLUME/2);ix(bs[mu]/2)) + svec[mu]-=bs[mu]; + } + + MPI_Bcast(svec,4,MPI_INT,0,MPI_COMM_WORLD); +} + + +int main(int argc,char *argv[]) +{ + int my_rank,i,s[4]; + double phi[2],phi_prime[2]; + double d,dmax1,dmax2; + double A1,A2,a1,a2; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check4.log","w",stdout); + printf("\n"); + printf("Gauge and translation invariance of the Yang-Mills action\n"); + printf("---------------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check4.c]", + "Syntax: check4 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + + g=amalloc(NSPIN*sizeof(*g),4); + + if (BNDRY>0) + gbuf=amalloc((BNDRY/2)*sizeof(*gbuf),4); + + error((g==NULL)||((BNDRY>0)&&(gbuf==NULL)),1,"main [check4.c]", + "Unable to allocate auxiliary arrays"); + + dmax1=0.0; + dmax2=0.0; + + for (i=0;i<8;i++) + { + random_ud(); + + A1=ym_action(); + random_vec(s); + if (bc!=3) + s[0]=0; + shift_ud(s); + A2=ym_action(); + + d=fabs(A1-A2)/A1; + if (d>dmax1) + dmax1=d; + + random_g(); + transform_ud(); + A2=ym_action(); + + d=fabs(A1-A2)/A1; + if (d>dmax2) + dmax2=d; + + a1=A1; + a2=A2; + + MPI_Bcast(&a1,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&a2,1,MPI_INT,0,MPI_COMM_WORLD); + + error((a1!=A1)||(a2!=A2),1,"main [check4.c]", + "Action is not globally the same"); + } + + error_chk(); + print_flags(); + + if (my_rank==0) + { + printf("Translation invariance = %.2e\n",dmax1); + printf("Gauge invariance = %.2e\n\n",dmax2); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check5.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check5.c new file mode 100644 index 0000000000000000000000000000000000000000..b5f1ec31ccb1790f16a66a9c013c1288054ead77 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check5.c @@ -0,0 +1,417 @@ + +/******************************************************************************* +* +* File check5.c +* +* Copyright (C) 2010, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Yang-Mills action of constant abelian background fields. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "su3fcts.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "tcharge.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int bc,np[4],bo[4]; +static double mt[4][4],inp[4],twopi; +static su3_dble ud0={{0.0}}; + + +static double afld(int *x,int mu) +{ + int nu; + double xt[4],phi; + + xt[0]=(double)(safe_mod(x[0],N0)); + xt[1]=(double)(safe_mod(x[1],N1)); + xt[2]=(double)(safe_mod(x[2],N2)); + xt[3]=(double)(safe_mod(x[3],N3)); + + phi=0.0; + + for (nu=0;nu1) + { + MPI_Reduce(&aloc,&aall,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&aall,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + return aall; + } + else + return aloc; +} + + +static double Amt(void) +{ + int mu,nu; + double sm,pi; + double xl[4],phi,ft1,ft2,ft3,tr; + + xl[0]=(double)(NPROC0*L0); + xl[1]=(double)(NPROC1*L1); + xl[2]=(double)(NPROC2*L2); + xl[3]=(double)(NPROC3*L3); + + pi=4.0*atan(1.0); + sm=0.0; + + for (mu=1;mu<4;mu++) + { + for (nu=0;nu=(VOLUME/2)) + { + x[0]=bo[0]+x0; + x[1]=bo[1]+x1; + x[2]=bo[2]+x2; + x[3]=bo[3]+x3; + + u=udb+8*(ix-(VOLUME/2)); + + for (ifc=0;ifc<8;ifc++) + { + if (ifc&0x1) + x[ifc/2]-=1; + + phi=afld(x,ifc/2); + + if (ifc&0x1) + x[ifc/2]+=1; + + (*u)=ud0; + (*u).c11.re=cos(phi); + (*u).c11.im=sin(phi); + (*u).c22.re=(*u).c11.re; + (*u).c22.im=(*u).c11.im; + (*u).c33.re=cos(-2.0*phi); + (*u).c33.im=sin(-2.0*phi); + u+=1; + } + } + } + } + } + } + + set_bc(); + set_flags(UPDATED_UD); +} + + +int main(int argc,char *argv[]) +{ + int my_rank,i; + double phi[2],phi_prime[2]; + double A1,A2,d,dmax; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check5.log","w",stdout); + printf("\n"); + printf("Yang-Mills action of constant abelian background fields\n"); + printf("-------------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check5.c]", + "Syntax: check5 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.9012,1.2034,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123); + geometry(); + + twopi=8.0*atan(1.0); + + np[0]=N0; + np[1]=N1; + np[2]=N2; + np[3]=N3; + + bo[0]=cpr[0]*L0; + bo[1]=cpr[1]*L1; + bo[2]=cpr[2]*L2; + bo[3]=cpr[3]*L3; + + inp[0]=1.0/(double)(np[0]); + inp[1]=1.0/(double)(np[1]); + inp[2]=1.0/(double)(np[2]); + inp[3]=1.0/(double)(np[3]); + + dmax=0.0; + + for (i=0;i<10;i++) + { + choose_mt(); + set_ud(); + + A1=Amt(); + A2=ym_action(); + + if (my_rank==0) + printf("Field no = %2d, A1 = %12.6e, A2 = %12.6e\n",i+1,A1,A2); + + d=fabs(A1-A2)/A1; + if (d>dmax) + dmax=d; + } + + error_chk(); + + if (my_rank==0) + { + printf("\n"); + printf("Maximal relative deviation = %.1e\n\n",dmax); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check6.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check6.c new file mode 100644 index 0000000000000000000000000000000000000000..17731ff6b04e175fa1173ce08ed381d28ba51113 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check6.c @@ -0,0 +1,146 @@ + +/******************************************************************************* +* +* File check6.c +* +* Copyright (C) 2010, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the program ym_action_slices(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "wflow.h" +#include "tcharge.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int bc,n,dn; +static double eps,A1,A2,A[N0],A0[N0]; + + +int main(int argc,char *argv[]) +{ + int my_rank,i,imax,t; + double phi[2],phi_prime[2]; + double dev; + FILE *fin=NULL,*flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check6.log","w",stdout); + fin=freopen("check6.in","r",stdin); + + printf("\n"); + printf("Check of the program ym_action_slices()\n"); + printf("---------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("n","%d\n",&n); + read_line("dn","%d\n",&dn); + read_line("eps","%lf",&eps); + fclose(fin); + + printf("n = %d\n",n); + printf("dn = %d\n",dn); + printf("eps = %.2e\n\n",eps); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check6.c]", + "Syntax: check6 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&n,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&dn,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&eps,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.9012,1.2034,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + alloc_wfd(2); + + random_ud(); + imax=n/dn; + + for (i=0;i that allows the type of boundary +condition to be chosen. When the option is not set, open boundary conditions +are assumed. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..88ddadd5cfe62e977a818793ea88e32e8e36e726 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/Makefile @@ -0,0 +1,125 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 check4 check5 + +FLAGS = flags lat_parms dfl_parms + +LATTICE = bcnds uidx geometry + +RANDOM = ranlux ranlxs ranlxd gauss + +UFLDS = plaq_sum shift uflds udcom bstap + +SU3FCTS = su3prod su3ren cm3x3 random_su3 + +UTILS = endian mutils utils wspace + +MODULES = $(FLAGS) $(LATTICE) $(RANDOM) $(UFLDS) $(SU3FCTS) $(UTILS) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/random:$(MDIR)/uflds:\ + $(MDIR)/su3fcts:$(MDIR)/utils + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..d97b092c45d43122b20046443ced57fc69c91f4c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check1.c @@ -0,0 +1,505 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2009, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Initialization of the link variables. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static complex_dble det_dble(su3_dble *u) +{ + complex_dble det1,det2,det3,detu; + + det1.re= + ((*u).c22.re*(*u).c33.re-(*u).c22.im*(*u).c33.im)- + ((*u).c23.re*(*u).c32.re-(*u).c23.im*(*u).c32.im); + det1.im= + ((*u).c22.re*(*u).c33.im+(*u).c22.im*(*u).c33.re)- + ((*u).c23.re*(*u).c32.im+(*u).c23.im*(*u).c32.re); + det2.re= + ((*u).c21.re*(*u).c33.re-(*u).c21.im*(*u).c33.im)- + ((*u).c23.re*(*u).c31.re-(*u).c23.im*(*u).c31.im); + det2.im= + ((*u).c21.re*(*u).c33.im+(*u).c21.im*(*u).c33.re)- + ((*u).c23.re*(*u).c31.im+(*u).c23.im*(*u).c31.re); + det3.re= + ((*u).c21.re*(*u).c32.re-(*u).c21.im*(*u).c32.im)- + ((*u).c22.re*(*u).c31.re-(*u).c22.im*(*u).c31.im); + det3.im= + ((*u).c21.re*(*u).c32.im+(*u).c21.im*(*u).c32.re)- + ((*u).c22.re*(*u).c31.im+(*u).c22.im*(*u).c31.re); + + detu.re= + ((*u).c11.re*det1.re-(*u).c11.im*det1.im)- + ((*u).c12.re*det2.re-(*u).c12.im*det2.im)+ + ((*u).c13.re*det3.re-(*u).c13.im*det3.im); + detu.im= + ((*u).c11.re*det1.im+(*u).c11.im*det1.re)- + ((*u).c12.re*det2.im+(*u).c12.im*det2.re)+ + ((*u).c13.re*det3.im+(*u).c13.im*det3.re); + + return detu; +} + + +static double dev_unity(su3 *u) +{ + int i; + float r[18]; + double d,dmax; + + r[ 0]=(*u).c11.re-1.0f; + r[ 1]=(*u).c11.im; + r[ 2]=(*u).c12.re; + r[ 3]=(*u).c12.im; + r[ 4]=(*u).c13.re; + r[ 5]=(*u).c13.im; + + r[ 6]=(*u).c21.re; + r[ 7]=(*u).c21.im; + r[ 8]=(*u).c22.re-1.0f; + r[ 9]=(*u).c22.im; + r[10]=(*u).c23.re; + r[11]=(*u).c23.im; + + r[12]=(*u).c31.re; + r[13]=(*u).c31.im; + r[14]=(*u).c32.re; + r[15]=(*u).c32.im; + r[16]=(*u).c33.re-1.0f; + r[17]=(*u).c33.im; + + dmax=0.0; + + for (i=0;i<18;i++) + { + d=fabs((double)(r[i])); + if (d>dmax) + dmax=d; + } + + return dmax; +} + + +static double dev_unity_dble(su3_dble *u) +{ + int i; + double r[18],d,dmax; + + r[ 0]=(*u).c11.re-1.0; + r[ 1]=(*u).c11.im; + r[ 2]=(*u).c12.re; + r[ 3]=(*u).c12.im; + r[ 4]=(*u).c13.re; + r[ 5]=(*u).c13.im; + + r[ 6]=(*u).c21.re; + r[ 7]=(*u).c21.im; + r[ 8]=(*u).c22.re-1.0; + r[ 9]=(*u).c22.im; + r[10]=(*u).c23.re; + r[11]=(*u).c23.im; + + r[12]=(*u).c31.re; + r[13]=(*u).c31.im; + r[14]=(*u).c32.re; + r[15]=(*u).c32.im; + r[16]=(*u).c33.re-1.0; + r[17]=(*u).c33.im; + + dmax=0.0; + + for (i=0;i<18;i++) + { + d=fabs(r[i]); + if (d>dmax) + dmax=d; + } + + return dmax; +} + + +static double dev_zero_dble(su3_dble *u) +{ + int i; + double r[18],d,dmax; + + r[ 0]=(*u).c11.re; + r[ 1]=(*u).c11.im; + r[ 2]=(*u).c12.re; + r[ 3]=(*u).c12.im; + r[ 4]=(*u).c13.re; + r[ 5]=(*u).c13.im; + + r[ 6]=(*u).c21.re; + r[ 7]=(*u).c21.im; + r[ 8]=(*u).c22.re; + r[ 9]=(*u).c22.im; + r[10]=(*u).c23.re; + r[11]=(*u).c23.im; + + r[12]=(*u).c31.re; + r[13]=(*u).c31.im; + r[14]=(*u).c32.re; + r[15]=(*u).c32.im; + r[16]=(*u).c33.re; + r[17]=(*u).c33.im; + + dmax=0.0; + + for (i=0;i<18;i++) + { + d=fabs(r[i]); + if (d>dmax) + dmax=d; + } + + return dmax; +} + + +static double dev_bval_dble(int k,double *phi,su3_dble *u) +{ + int i; + double r[18],s[3],phi3,d,dmax; + + s[0]=(double)(N1); + s[1]=(double)(N2); + s[2]=(double)(N3); + phi3=-phi[0]-phi[1]; + + r[ 0]=(*u).c11.re-cos(phi[0]/s[k-1]); + r[ 1]=(*u).c11.im-sin(phi[0]/s[k-1]); + r[ 2]=(*u).c12.re; + r[ 3]=(*u).c12.im; + r[ 4]=(*u).c13.re; + r[ 5]=(*u).c13.im; + + r[ 6]=(*u).c21.re; + r[ 7]=(*u).c21.im; + r[ 8]=(*u).c22.re-cos(phi[1]/s[k-1]); + r[ 9]=(*u).c22.im-sin(phi[1]/s[k-1]); + r[10]=(*u).c23.re; + r[11]=(*u).c23.im; + + r[12]=(*u).c31.re; + r[13]=(*u).c31.im; + r[14]=(*u).c32.re; + r[15]=(*u).c32.im; + r[16]=(*u).c33.re-cos(phi3/s[k-1]); + r[17]=(*u).c33.im-sin(phi3/s[k-1]); + + dmax=0.0; + + for (i=0;i<18;i++) + { + d=fabs(r[i]); + if (d>dmax) + dmax=d; + } + + return dmax; +} + + +static double dev_uudag_dble(su3_dble *u) +{ + su3_dble udag,w; + + _su3_dagger(udag,(*u)); + _su3_times_su3(w,(*u),udag); + + return dev_unity_dble(&w); +} + + +static double dev_detu_dble(su3_dble *u) +{ + double d,dmax; + complex_dble detu; + + detu=det_dble(u); + dmax=0.0; + + d=fabs(1.0-detu.re); + if (d>dmax) + dmax=d; + d=fabs(detu.im); + if (d>dmax) + dmax=d; + + return dmax; +} + + +static double dev_udu_dble(su3_dble *ud,su3 *u) +{ + int i; + double r[18],d,dmax; + + r[ 0]=(*ud).c11.re-(double)((*u).c11.re); + r[ 1]=(*ud).c11.im-(double)((*u).c11.im); + r[ 2]=(*ud).c12.re-(double)((*u).c12.re); + r[ 3]=(*ud).c12.im-(double)((*u).c12.im); + r[ 4]=(*ud).c13.re-(double)((*u).c13.re); + r[ 5]=(*ud).c13.im-(double)((*u).c13.im); + r[ 6]=(*ud).c21.re-(double)((*u).c21.re); + r[ 7]=(*ud).c21.im-(double)((*u).c21.im); + r[ 8]=(*ud).c22.re-(double)((*u).c22.re); + r[ 9]=(*ud).c22.im-(double)((*u).c22.im); + r[10]=(*ud).c23.re-(double)((*u).c23.re); + r[11]=(*ud).c23.im-(double)((*u).c23.im); + r[12]=(*ud).c31.re-(double)((*u).c31.re); + r[13]=(*ud).c31.im-(double)((*u).c31.im); + r[14]=(*ud).c32.re-(double)((*u).c32.re); + r[15]=(*ud).c32.im-(double)((*u).c32.im); + r[16]=(*ud).c33.re-(double)((*u).c33.re); + r[17]=(*ud).c33.im-(double)((*u).c33.im); + + dmax=0.0; + + for (i=0;i<18;i++) + { + d=fabs(r[i]); + if (d>dmax) + dmax=d; + } + + return dmax; +} + + +int main(int argc,char *argv[]) +{ + int my_rank,bc; + int iu,ix,ifc,x0,k,ie; + double d1,d2,dmax1,dmax2; + double dmax1_all,dmax2_all; + double phi[2],phi_prime[2]; + su3 *u,*ub,*um; + su3_dble *ud,*udb,*udm; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + + printf("\n"); + printf("Initialization of the link variables\n"); + printf("------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check1.c]", + "Syntax: check1 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + + ub=ufld(); + um=ub+4*VOLUME; + dmax1=0.0; + + for (u=ub;udmax1) + dmax1=d1; + } + + MPI_Reduce(&dmax1,&dmax1_all,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("Allocate single-precision gauge field\n"); + printf("|u-1| = %.2e\n\n",dmax1_all); + } + + print_flags(); + + udb=udfld(); + + ie=check_bc(0.0); + error_root(ie==0,1,"main [check1.c]","Boundary conditions not properly set"); + + udm=udb+4*VOLUME; + dmax1=0.0; + dmax2=0.0; + + for (ud=udb;uddmax2) + dmax2=d2; + } + else if ((bc!=1)||(x0>0)||(ifc<2)) + { + d1=dev_unity_dble(ud); + if (d1>dmax1) + dmax1=d1; + } + else + { + d2=dev_bval_dble(ifc/2,phi,ud); + if (d2>dmax2) + dmax2=d2; + } + } + + if ((cpr[0]==(NPROC0-1))&&((bc==1)||(bc==2))) + { + ud=udb+4*VOLUME+7*(BNDRY/4); + + for (k=1;k<4;k++) + { + d2=dev_bval_dble(k,phi_prime,ud); + ud+=1; + + if (d2>dmax2) + dmax2=d2; + } + } + + MPI_Reduce(&dmax1,&dmax1_all,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Reduce(&dmax2,&dmax2_all,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("Allocate double-precision gauge field\n"); + printf("|ud-1| = %.2e\n",dmax1_all); + if (bc!=3) + printf("|ud-bval| = %.2e\n",dmax2_all); + printf("\n"); + } + + print_flags(); + + random_ud(); + assign_ud2u(); + + ie=check_bc(0.0); + error_root(ie==0,1,"main [check1.c]","Boundary conditions changed"); + + ud=udb; + udm=udb+4*VOLUME; + u=ub; + dmax1=0.0; + + for (ud=udb;uddmax1) + dmax1=d1; + + u+=1; + } + + MPI_Reduce(&dmax1,&dmax1_all,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("Random fields\n"); + printf("Assign double-precision to single-precision field\n"); + printf("Maximal deviation = %.2e\n\n",dmax1_all); + } + + print_flags(); + + random_ud(); + dmax1=0.0; + dmax2=0.0; + + for (ud=udb;uddmax1) + dmax1=d1; + if (d2>dmax2) + dmax2=d2; + } + + MPI_Reduce(&dmax1,&dmax1_all,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Reduce(&dmax2,&dmax2_all,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("Call random_ud\n"); + printf("|u^dag*u-1| = %.2e\n",dmax1_all); + printf("|det{u}-1| = %.2e\n\n",dmax2_all); + } + + print_flags(); + + if (my_rank==0) + fclose(flog); + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..039ae76be3b000ec8bc429d63aa2abeb0529b591 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check2.c @@ -0,0 +1,396 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2010, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Renormalization of the link variables. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "global.h" + +#define N0 (NPROC0*L0) + +static int bc; + + +static int is_zero_dble(su3_dble *ud) +{ + int i; + double r[18]; + + r[ 0]=(*ud).c11.re; + r[ 1]=(*ud).c11.im; + r[ 2]=(*ud).c12.re; + r[ 3]=(*ud).c12.im; + r[ 4]=(*ud).c13.re; + r[ 5]=(*ud).c13.im; + + r[ 6]=(*ud).c21.re; + r[ 7]=(*ud).c21.im; + r[ 8]=(*ud).c22.re; + r[ 9]=(*ud).c22.im; + r[10]=(*ud).c23.re; + r[11]=(*ud).c23.im; + + r[12]=(*ud).c31.re; + r[13]=(*ud).c31.im; + r[14]=(*ud).c32.re; + r[15]=(*ud).c32.im; + r[16]=(*ud).c33.re; + r[17]=(*ud).c33.im; + + for (i=0;i<18;i++) + { + if (r[i]!=0.0) + return 0; + } + + return 1; +} + + +static complex_dble det_dble(su3_dble *u) +{ + complex_dble det1,det2,det3,detu; + + det1.re= + ((*u).c22.re*(*u).c33.re-(*u).c22.im*(*u).c33.im)- + ((*u).c23.re*(*u).c32.re-(*u).c23.im*(*u).c32.im); + det1.im= + ((*u).c22.re*(*u).c33.im+(*u).c22.im*(*u).c33.re)- + ((*u).c23.re*(*u).c32.im+(*u).c23.im*(*u).c32.re); + det2.re= + ((*u).c21.re*(*u).c33.re-(*u).c21.im*(*u).c33.im)- + ((*u).c23.re*(*u).c31.re-(*u).c23.im*(*u).c31.im); + det2.im= + ((*u).c21.re*(*u).c33.im+(*u).c21.im*(*u).c33.re)- + ((*u).c23.re*(*u).c31.im+(*u).c23.im*(*u).c31.re); + det3.re= + ((*u).c21.re*(*u).c32.re-(*u).c21.im*(*u).c32.im)- + ((*u).c22.re*(*u).c31.re-(*u).c22.im*(*u).c31.im); + det3.im= + ((*u).c21.re*(*u).c32.im+(*u).c21.im*(*u).c32.re)- + ((*u).c22.re*(*u).c31.im+(*u).c22.im*(*u).c31.re); + + detu.re= + ((*u).c11.re*det1.re-(*u).c11.im*det1.im)- + ((*u).c12.re*det2.re-(*u).c12.im*det2.im)+ + ((*u).c13.re*det3.re-(*u).c13.im*det3.im); + detu.im= + ((*u).c11.re*det1.im+(*u).c11.im*det1.re)- + ((*u).c12.re*det2.im+(*u).c12.im*det2.re)+ + ((*u).c13.re*det3.im+(*u).c13.im*det3.re); + + return detu; +} + + +static double dev_uudag_dble(su3_dble *u,su3_dble *v) +{ + int i; + double r[18],d,dmax; + su3_dble vdag,w; + + _su3_dagger(vdag,(*v)); + _su3_times_su3(w,(*u),vdag); + + w.c11.re-=1.0; + w.c22.re-=1.0; + w.c33.re-=1.0; + + r[ 0]=w.c11.re; + r[ 1]=w.c11.im; + r[ 2]=w.c12.re; + r[ 3]=w.c12.im; + r[ 4]=w.c13.re; + r[ 5]=w.c13.im; + + r[ 6]=w.c21.re; + r[ 7]=w.c21.im; + r[ 8]=w.c22.re; + r[ 9]=w.c22.im; + r[10]=w.c23.re; + r[11]=w.c23.im; + + r[12]=w.c31.re; + r[13]=w.c31.im; + r[14]=w.c32.re; + r[15]=w.c32.im; + r[16]=w.c33.re; + r[17]=w.c33.im; + + dmax=0.0; + + for (i=0;i<18;i++) + { + d=fabs(r[i]); + if (d>dmax) + dmax=d; + } + + return dmax; +} + + +static double dev_detu_dble(su3_dble *u) +{ + double d,dmax; + complex_dble detu; + + detu=det_dble(u); + dmax=0.0; + + d=fabs(1.0-detu.re); + if (d>dmax) + dmax=d; + d=fabs(detu.im); + if (d>dmax) + dmax=d; + + return dmax; +} + + +static void check_ud(double *dev1,double *dev2) +{ + int iu,ix,ifc,x0; + double d1,d2,dmax1,dmax2; + su3_dble *u,*ub,*um; + + ub=udfld(); + um=ub+4*VOLUME; + dmax1=0.0; + dmax2=0.0; + + for (u=ub;udmax1) + dmax1=d1; + if (d2>dmax2) + dmax2=d2; + } + + MPI_Reduce(&dmax1,dev1,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Reduce(&dmax2,dev2,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(dev1,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(dev2,1,MPI_DOUBLE,0,MPI_COMM_WORLD); +} + + +static double cmp_ud(su3_dble *usv) +{ + int ix,ifc,x0; + double d1,dmax1; + su3_dble *ub,*u,*v,*um; + + ub=udfld(); + um=ub+4*VOLUME; + v=usv; + dmax1=0.0; + + for (u=ub;udmax1) + dmax1=d1; + + v+=1; + } + + error_chk(); + + d1=dmax1; + MPI_Reduce(&d1,&dmax1,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax1,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return dmax1; +} + + +static void tilt_ud(double eps) +{ + int ix,ifc,t; + double r[18]; + su3_dble *ud; + + ud=udfld(); + + for (ix=(VOLUME/2);ix0)))|| + ((ifc>=2)&&((bc!=1)||(t>0)))) + { + gauss_dble(r,18); + + (*ud).c11.re+=eps*r[ 0]; + (*ud).c11.im+=eps*r[ 1]; + (*ud).c12.re+=eps*r[ 2]; + (*ud).c12.im+=eps*r[ 3]; + (*ud).c13.re+=eps*r[ 4]; + (*ud).c13.im+=eps*r[ 5]; + + (*ud).c21.re+=eps*r[ 6]; + (*ud).c21.im+=eps*r[ 7]; + (*ud).c22.re+=eps*r[ 8]; + (*ud).c22.im+=eps*r[ 9]; + (*ud).c23.re+=eps*r[10]; + (*ud).c23.im+=eps*r[11]; + + (*ud).c31.re+=eps*r[12]; + (*ud).c31.im+=eps*r[13]; + (*ud).c32.re+=eps*r[14]; + (*ud).c32.im+=eps*r[15]; + (*ud).c33.re+=eps*r[16]; + (*ud).c33.im+=eps*r[17]; + } + + ud+=1; + } + } +} + + +int main(int argc,char *argv[]) +{ + int my_rank,ie; + double d1,d2,d3,d4,d5; + double phi[2],phi_prime[2]; + su3_dble *udb,**usv; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check2.log","w",stdout); + + printf("\n"); + printf("Renormalization of the link variables\n"); + printf("-------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check2.c]", + "Syntax: check2 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + alloc_wud(1); + usv=reserve_wud(1); + udb=udfld(); + + random_ud(); + check_ud(&d1,&d2); + + if (my_rank==0) + { + printf("Random double-precision gauge field:\n"); + printf("|u^dag*u-1| = %.2e\n",d1); + printf("|det{u}-1| = %.2e\n\n",d2); + } + + cm3x3_assign(4*VOLUME,udb,usv[0]); + tilt_ud(50.0*DBL_EPSILON); + check_ud(&d1,&d2); + renormalize_ud(); + d3=cmp_ud(usv[0]); + check_ud(&d4,&d5); + + ie=check_bc(0.0); + error_root(ie==0,1,"main [check2.c]","Boundary conditions changed"); + + if (my_rank==0) + { + printf("Tilt double-precision gauge field:\n"); + printf("|u^dag*u-1| = %.2e\n",d1); + printf("|det{u}-1| = %.2e\n\n",d2); + + printf("After renormalization:\n"); + printf("|u^dag*u_old-1| = %.2e\n",d3); + printf("|u^dag*u-1| = %.2e\n",d4); + printf("|det{u}-1| = %.2e\n\n",d5); + } + + random_ud(); + cm3x3_assign(4*VOLUME,udb,usv[0]); + renormalize_ud(); + d1=cmp_ud(usv[0]); + + ie=check_bc(0.0); + error_root(ie==0,1,"main [check2.c]","Boundary conditions changed"); + + if (my_rank==0) + { + printf("Renormalization of a fresh random double-precision field:\n"); + printf("Maximal change in the link variables = %.2e\n\n",d1); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..8ceb1a136b408675e4a638a96e749cacc95122be --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check3.c @@ -0,0 +1,448 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2005, 2007, 2011, 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the program that translates the double-precision gauge field. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "global.h" + +static int my_rank,ipsnd,iprcv,*isnd; +static su3_dble *uold,*unew,*ubuf; + + +static void alloc_bufs(void) +{ + isnd=amalloc(NPROC*sizeof(*isnd),3); + uold=amalloc(12*VOLUME*sizeof(*uold),ALIGN); + error((isnd==NULL)||(uold==NULL),1,"alloc_bufs [check3.c]", + "Unable to allocate auxiliary arrays"); + + unew=uold+4*VOLUME; + ubuf=unew+4*VOLUME; +} + + +static int range(int *dist,int *s,int *ra,int *rb) +{ + int io,l[4],nl[4]; + int mu,a,b; + + io=1; + + l[0]=L0; + l[1]=L1; + l[2]=L2; + l[3]=L3; + + nl[0]=L0*NPROC0; + nl[1]=L1*NPROC1; + nl[2]=L2*NPROC2; + nl[3]=L3*NPROC3; + + for (mu=0;mu<4;mu++) + { + a=dist[mu]+s[mu]; + b=a+l[mu]; + + a=safe_mod(a,nl[mu]); + b=safe_mod(b,nl[mu]); + + if (a==b) + { + ra[mu]=0; + rb[mu]=l[mu]; + } + else if (a1) + { + io=itest; + MPI_Reduce(&io,&itest,1,MPI_INT,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&itest,1,MPI_INT,0,MPI_COMM_WORLD); + } + + return itest; +} + + +static void random_vec(int *svec) +{ + int mu,bs[4]; + double r[4]; + + bs[0]=NPROC0*L0; + bs[1]=NPROC1*L1; + bs[2]=NPROC2*L2; + bs[3]=NPROC3*L3; + + ranlxd(r,4); + + for (mu=0;mu<4;mu++) + { + svec[mu]=(int)((double)(bs[mu])*r[mu]); + if (svec[mu]>(bs[mu]/2)) + svec[mu]-=bs[mu]; + } + + MPI_Bcast(svec,4,MPI_INT,0,MPI_COMM_WORLD); +} + + +int main(int argc,char *argv[]) +{ + int bc,ie; + int ifc,mu,s[4],n,itest; + double phi[2],phi_prime[2]; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check3.log","w",stdout); + + printf("\n"); + printf("Translation of the double-precision gauge field\n"); + printf("-----------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check3.c]", + "Syntax: check3 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + geometry(); + alloc_bufs(); + + if (my_rank==0) + printf("Elementary shift vectors:\n\n"); + + for (ifc=0;ifc<8;ifc++) + { + if ((ifc>1)||(bc==3)) + { + random_ud(); + save_field(uold); + + s[0]=0; + s[1]=0; + s[2]=0; + s[3]=0; + mu=ifc/2; + + if ((ifc&0x1)==0) + s[mu]=1; + else + s[mu]=-1; + + shift_ud(s); + save_field(unew); + itest=cmp_field(s); + + ie=check_bc(0.0); + error_root(ie==0,1,"main [check3.c]","Boundary conditions changed"); + + if (my_rank==0) + { + printf("Shift vector (% 3d,% 3d,% 3d,% 3d): ", + s[0],s[1],s[2],s[3]); + + if (itest==0) + printf("ok\n"); + else + printf("failed\n"); + } + } + } + + if (my_rank==0) + { + printf("\n"); + printf("Random shift vectors:\n\n"); + } + + for (n=0;n<8;n++) + { + random_ud(); + save_field(uold); + + random_vec(s); + if (bc!=3) + s[0]=0; + shift_ud(s); + save_field(unew); + itest=cmp_field(s); + + ie=check_bc(0.0); + error_root(ie==0,1,"main [check3.c]","Boundary conditions changed"); + + if (my_rank==0) + { + printf("Shift vector (% 3d,% 3d,% 3d,% 3d): ", + s[0],s[1],s[2],s[3]); + + if (itest==0) + printf("ok\n"); + else + printf("failed\n"); + } + } + + error_chk(); + + if (my_rank==0) + { + printf("\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check4.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check4.c new file mode 100644 index 0000000000000000000000000000000000000000..46f1b451b6a0c75d0648961525623c03fc360f8a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check4.c @@ -0,0 +1,491 @@ + +/******************************************************************************* +* +* File check4.c +* +* Copyright (C) 2005, 2007-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the programs for the plaquette sums of the double-precision +* gauge field. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int bc,nfc[8],ofs[8]; +static double asl1[N0],asl2[N0]; +static const su3_dble ud0={{0.0}}; +static su3_dble *g,*gbuf; +static su3_dble wd ALIGNED16; + + +static void pack_gbuf(void) +{ + int ifc,ib,ix; + + nfc[0]=FACE0/2; + nfc[1]=FACE0/2; + nfc[2]=FACE1/2; + nfc[3]=FACE1/2; + nfc[4]=FACE2/2; + nfc[5]=FACE2/2; + nfc[6]=FACE3/2; + nfc[7]=FACE3/2; + + ofs[0]=0; + ofs[1]=ofs[0]+nfc[0]; + ofs[2]=ofs[1]+nfc[1]; + ofs[3]=ofs[2]+nfc[2]; + ofs[4]=ofs[3]+nfc[3]; + ofs[5]=ofs[4]+nfc[4]; + ofs[6]=ofs[5]+nfc[5]; + ofs[7]=ofs[6]+nfc[6]; + + for (ifc=0;ifc<8;ifc++) + { + for (ib=0;ib0) + { + tag=mpi_tag(); + saddr=npr[ifc^0x1]; + raddr=npr[ifc]; + sbuf=gbuf+ofs[ifc]; + rbuf=g+VOLUME+ofs[ifc]; + + if (np&0x1) + { + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + } + } +} + + +static void random_g(void) +{ + int ix,t; + su3_dble unity,*gx; + + unity=ud0; + unity.c11.re=1.0; + unity.c22.re=1.0; + unity.c33.re=1.0; + gx=g; + + for (ix=0;ix0)||(bc!=1)) + random_su3_dble(gx); + else + (*gx)=unity; + + gx+=1; + } + + if (BNDRY>0) + { + pack_gbuf(); + send_gbuf(); + } +} + + +static void transform_ud(void) +{ + int ix,iy,t,ifc; + su3_dble *u; + + u=udfld(); + + for (ix=(VOLUME/2);ix(bs[mu]/2)) + svec[mu]-=bs[mu]; + } + + MPI_Bcast(svec,4,MPI_INT,0,MPI_COMM_WORLD); +} + + +int main(int argc,char *argv[]) +{ + int my_rank,n,t,s[4]; + double phi[2],phi_prime[2],act1; + double nplaq1,nplaq2,p1,p2; + double d1,d2,d3; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check4.log","w",stdout); + + printf("\n"); + printf("Plaquette sums of the double-precision gauge field\n"); + printf("--------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check4.c]", + "Syntax: check4 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + + g=amalloc(NSPIN*sizeof(*g),4); + + if (BNDRY>0) + gbuf=amalloc((BNDRY/2)*sizeof(*gbuf),4); + + error((g==NULL)||((BNDRY>0)&&(gbuf==NULL)),1,"main [check4.c]", + "Unable to allocate auxiliary arrays"); + + p1=plaq_sum_dble(1); + p2=plaq_wsum_dble(1); + + if (bc==0) + { + nplaq1=(double)((6*N0-3)*N1)*(double)(N2*N3); + nplaq2=(double)((6*N0-6)*N1)*(double)(N2*N3); + } + else if (bc==3) + { + nplaq1=(double)(6*N0*N1)*(double)(N2*N3); + nplaq2=nplaq1; + } + else + { + nplaq1=(double)((6*N0+3)*N1)*(double)(N2*N3); + nplaq2=(double)(6*N0*N1)*(double)(N2*N3); + } + + d1=0.0; + d2=0.0; + + if (bc==1) + { + d1=cos(phi[0]/(double)(N1))+ + cos(phi[1]/(double)(N1))+ + cos((phi[0]+phi[1])/(double)(N1))+ + cos(phi[0]/(double)(N2))+ + cos(phi[1]/(double)(N2))+ + cos((phi[0]+phi[1])/(double)(N2))+ + cos(phi[0]/(double)(N3))+ + cos(phi[1]/(double)(N3))+ + cos((phi[0]+phi[1])/(double)(N3)); + + d1=(d1-9.0)*(double)(N1*N2*N3); + } + + if ((bc==1)||(bc==2)) + { + d2=cos(phi_prime[0]/(double)(N1))+ + cos(phi_prime[1]/(double)(N1))+ + cos((phi_prime[0]+phi_prime[1])/(double)(N1))+ + cos(phi_prime[0]/(double)(N2))+ + cos(phi_prime[1]/(double)(N2))+ + cos((phi_prime[0]+phi_prime[1])/(double)(N2))+ + cos(phi_prime[0]/(double)(N3))+ + cos(phi_prime[1]/(double)(N3))+ + cos((phi_prime[0]+phi_prime[1])/(double)(N3)); + + d2=(d2-9.0)*(double)(N1*N2*N3); + } + + if (my_rank==0) + { + printf("After field initialization:\n"); + printf("Deviation from expected value (plaq_sum) = %.1e\n", + fabs(1.0-p1/(3.0*nplaq1+d1+d2))); + printf("Deviation from expected value (plaq_wsum) = %.1e\n\n", + fabs(1.0-p2/(3.0*nplaq2+d1+d2))); + } + + print_flags(); + random_ud(); + + p1=plaq_sum_dble(1); + p2=plaq_wsum_dble(1); + act1=plaq_action_slices(asl1); + d1=act1; + + if ((bc==0)||(bc==3)) + { + for (t=0;t +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "global.h" + +static const int plns[6][2]={{0,1},{0,2},{0,3},{2,3},{3,1},{1,2}}; +static int bc,nfc[8],ofs[8],hofs[8]; +static double psum0[8],psum1[8]; +static su3_dble *udb,*hdb; +static su3_dble wd1,wd2 ALIGNED16; + + +static void set_ofs(void) +{ + nfc[0]=FACE0/2; + nfc[1]=FACE0/2; + nfc[2]=FACE1/2; + nfc[3]=FACE1/2; + nfc[4]=FACE2/2; + nfc[5]=FACE2/2; + nfc[6]=FACE3/2; + nfc[7]=FACE3/2; + + ofs[0]=0; + ofs[1]=ofs[0]+(FACE0/2); + ofs[2]=ofs[1]+(FACE0/2); + ofs[3]=ofs[2]+(FACE1/2); + ofs[4]=ofs[3]+(FACE1/2); + ofs[5]=ofs[4]+(FACE2/2); + ofs[6]=ofs[5]+(FACE2/2); + ofs[7]=ofs[6]+(FACE3/2); + + hofs[0]=0; + hofs[1]=hofs[0]+3*FACE0; + hofs[2]=hofs[1]+3*FACE0; + hofs[3]=hofs[2]+3*FACE1; + hofs[4]=hofs[3]+3*FACE1; + hofs[5]=hofs[4]+3*FACE2; + hofs[6]=hofs[5]+3*FACE2; + hofs[7]=hofs[6]+3*FACE3; +} + + +static double plaq0(int n,int ix) +{ + int ip[4]; + double sm; + + plaq_uidx(n,ix,ip); + + su3xsu3(udb+ip[0],udb+ip[1],&wd1); + su3dagxsu3dag(udb+ip[3],udb+ip[2],&wd2); + cm3x3_retr(&wd1,&wd2,&sm); + + return sm; +} + + +static double plaq1(int iu,int ih) +{ + su3xsu3dag(udb+iu,hdb+ih,&wd1); + + return wd1.c11.re+wd1.c22.re+wd1.c33.re; +} + + +static void set_psum0(void) +{ + int ifc,n,ix,mu,nu; + + for (ifc=0;ifc<8;ifc++) + psum0[ifc]=0.0; + + for (ix=0;ix=VOLUME) + psum0[2*mu+1]+=plaq0(n,ix); + + if (iup[ix][nu]>=VOLUME) + psum0[2*nu+1]+=plaq0(n,ix); + + if (idn[ix][mu]>=VOLUME) + psum0[2*mu]+=plaq0(n,ix); + + if (idn[ix][nu]>=VOLUME) + psum0[2*nu]+=plaq0(n,ix); + } + } +} + + +static void set_psum1(void) +{ + int ifc,n,ix,mu,nu,ip[4]; + int iy,ib,iu,ih; + + for (ifc=0;ifc<8;ifc++) + psum1[ifc]=0.0; + + for (ix=0;ix=VOLUME) + { + plaq_uidx(n,ix,ip); + iu=ip[1]; + + ifc=2*mu+1; + iy=iup[ix][mu]-VOLUME; + + if (iy<(BNDRY/2)) + ib=iy-ofs[ifc]; + else + ib=iy-ofs[ifc]-(BNDRY/2)+nfc[ifc]; + + ih=hofs[ifc]+3*ib+nu-(nu>mu); + + psum1[ifc]+=plaq1(iu,ih); + } + + if (iup[ix][nu]>=VOLUME) + { + plaq_uidx(n,ix,ip); + iu=ip[3]; + + ifc=2*nu+1; + iy=iup[ix][nu]-VOLUME; + + if (iy<(BNDRY/2)) + ib=iy-ofs[ifc]; + else + ib=iy-ofs[ifc]-(BNDRY/2)+nfc[ifc]; + + ih=hofs[ifc]+3*ib+mu-(mu>nu); + + psum1[ifc]+=plaq1(iu,ih); + } + + if (idn[ix][mu]>=VOLUME) + { + plaq_uidx(n,ix,ip); + iu=ip[2]; + + ifc=2*mu; + iy=idn[ix][mu]-VOLUME; + + if (iy<(BNDRY/2)) + ib=iy-ofs[ifc]; + else + ib=iy-ofs[ifc]-(BNDRY/2)+nfc[ifc]; + + ih=hofs[ifc]+3*ib+nu-(nu>mu); + + psum1[ifc]+=plaq1(iu,ih); + } + + if (idn[ix][nu]>=VOLUME) + { + plaq_uidx(n,ix,ip); + iu=ip[0]; + + ifc=2*nu; + iy=idn[ix][nu]-VOLUME; + + if (iy<(BNDRY/2)) + ib=iy-ofs[ifc]; + else + ib=iy-ofs[ifc]-(BNDRY/2)+nfc[ifc]; + + ih=hofs[ifc]+3*ib+mu-(mu>nu); + + psum1[ifc]+=plaq1(iu,ih); + } + } + } +} + + +static void check_psums(void) +{ + int ifc,np; + int saddr,raddr,nbf,tag; + double sbuf,rbuf,dmy[8]; + MPI_Status stat; + + np=(cpr[0]+cpr[1]+cpr[2]+cpr[3])&0x1; + + for (ifc=0;ifc<8;ifc++) + { + if (nfc[ifc]>0) + { + saddr=npr[ifc]; + raddr=npr[ifc^0x1]; + sbuf=psum0[ifc]; + nbf=1; + tag=mpi_tag(); + + if (np==0) + { + MPI_Send(&sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + MPI_Recv(&rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + MPI_Recv(&rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(&sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + + if ((bc!=3)&& + (((cpr[0]==0)&&(ifc==1))||((cpr[0]==(NPROC0-1))&&(ifc==0)))) + psum1[ifc^0x1]=0.0; + else + psum1[ifc^0x1]-=rbuf; + } + } + + for (ifc=0;ifc<8;ifc++) + dmy[ifc]=fabs(psum0[ifc]); + + MPI_Reduce(dmy,psum0,8,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(psum0,8,MPI_DOUBLE,0,MPI_COMM_WORLD); + + for (ifc=0;ifc<8;ifc++) + dmy[ifc]=fabs(psum1[ifc]); + + MPI_Reduce(dmy,psum1,8,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(psum1,8,MPI_DOUBLE,0,MPI_COMM_WORLD); +} + + +int main(int argc,char *argv[]) +{ + int my_rank,ifc,ie; + double phi[2],phi_prime[2]; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check5.log","w",stdout); + + printf("\n"); + printf("Check of the program set_bstap()\n"); + printf("--------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check5.c]", + "Syntax: check5 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,89103); + geometry(); + + print_flags(); + + random_ud(); + set_bstap(); + + print_flags(); + + udb=udfld(); + hdb=bstap(); + set_ofs(); + set_psum0(); + set_psum1(); + check_psums(); + + ie=check_bc(0.0); + error_root(ie==0,1,"main [check5.c]","Boundary conditions changed"); + + if (my_rank==0) + { + for (ifc=0;ifc<8;ifc++) + { + if (nfc[ifc]>0) + { + printf("ifc = %d, max|sum| = %.4e, maximal deviation = %.1e\n", + ifc,psum0[ifc],psum1[ifc]); + } + } + + printf("\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..cd36ba36a838f84804ffb47d4a30da40e2401383 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/INDEX @@ -0,0 +1,22 @@ + +HMC algorithm + +check1 Check of the program set_mdsteps(). + +check2 Reversibility of the MD evolution. + +check3 Conservation of the Hamilton function by the MD evolution. + +check4 Check of add_chrono() and get_chrono(). + +check5 Comparison of rwtm*() with action1(). + +check6 Comparison of rwtm*eo() with action4(). + +The programs check5 and check6 accept the option -bc that allows the +type of boundary condition to be chosen at runtime. When the option is not +set, open boundary conditions are assumed. + +The option may be set but has no effect in the case of the other programs. In +the case of check2 and check3, the boundary conditions are selected through +the input parameter file. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..0a831c210391cb11f6f2fdb3f5ca9589b9b95a24 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/Makefile @@ -0,0 +1,168 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 check4 check5 check6 + +ARCHIVE = archive + +BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + +DFL = dfl_geometry dfl_subspace ltl_gcr dfl_sap_gcr dfl_modes + +DIRAC = Dw_dble Dw Dw_bnd + +FLAGS = flags action_parms dfl_parms force_parms hmc_parms lat_parms \ + mdint_parms rat_parms sap_parms solver_parms + +FORCES = force0 force1 force2 force3 force4 force5 \ + frcfcts genfrc tmcg tmcgm xtensor + +LATTICE = bcnds uidx ftidx geometry + +LINALG = salg salg_dble valg valg_dble liealg cmatrix_dble cmatrix + +LINSOLV = cgne fgcr fgcr4vd mscg + +LITTLE = Aw_gen Aw_com Aw_ops Aw_dble Aw ltl_modes + +MDFLDS = mdflds fcom + +RANDOM = ranlux ranlxs ranlxd gauss + +RATFCTS = elliptic zolotarev ratfcts + +SAP = sap_com sap_gcr sap blk_solv + +SFLDS = sflds scom sdcom Pbnd Pbnd_dble + +SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +SW_TERM = pauli pauli_dble swflds sw_term + +TCHARGE = ftcom ftensor + +UFLDS = plaq_sum uflds udcom bstap + +UPDATE = chrono mdsteps counters mdint hmc rwrat rwtm rwtmeo + +UTILS = endian mutils utils wspace + +VFLDS = vflds vinit vcom vdcom + +MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ + $(LATTICE) $(LINALG) $(LINSOLV) $(LITTLE) $(MDFLDS) $(RANDOM) \ + $(RATFCTS) $(SAP) $(SFLDS) $(SU3FCTS) $(SW_TERM) $(TCHARGE) \ + $(UFLDS) $(UPDATE) $(UTILS) $(VFLDS) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/mdflds:$(MDIR)/su3fcts:\ + $(MDIR)/utils:$(MDIR)/forces:$(MDIR)/sflds:$(MDIR)/dirac:\ + $(MDIR)/sw_term:$(MDIR)/tcharge:$(MDIR)/block:$(MDIR)/sap:\ + $(MDIR)/linsolv:$(MDIR)/dfl:$(MDIR)/vflds:$(MDIR)/little:\ + $(MDIR)/update:$(MDIR)/ratfcts + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + +# -DMDINT_DBG -DRWRAT_DBG + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..033baccbb528de1949ace998ee53d361dcfbebe1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check1.c @@ -0,0 +1,207 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of set_mdsteps(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "mdflds.h" +#include "update.h" +#include "global.h" + +static int my_rank; +static force_t force[]={FRG,FRF_TM1,FRF_TM1_EO,FRF_TM1_EO_SDET, + FRF_TM2,FRF_TM2_EO,FRF_RAT,FRF_RAT_SDET}; + + +static void read_hmc_parms(void) +{ + int nlv; + double tau; + + if (my_rank==0) + { + find_section("HMC parameters"); + read_line("nlv","%d",&nlv); + read_line("tau","%lf",&tau); + } + + MPI_Bcast(&nlv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&tau,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + set_hmc_parms(0,NULL,0,0,NULL,nlv,tau); +} + + +static void read_integrator(void) +{ + int nlv,i,j,k,idf; + int irat[3],imu[4],isp[4],ncr[4]; + hmc_parms_t hmc; + mdint_parms_t mdp; + force_parms_t fp; + char line[NAME_SIZE]; + + for (i=0;i<3;i++) + irat[i]=0; + + for (i=0;i<4;i++) + { + imu[i]=0; + isp[i]=0; + ncr[i]=0; + } + + hmc=hmc_parms(); + nlv=hmc.nlv; + + for (i=0;i +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "archive.h" +#include "forces.h" +#include "dfl.h" +#include "update.h" +#include "global.h" + +static int my_rank; + + +static void read_lat_parms(void) +{ + int nk; + double beta,c0,csw,*kappa; + + if (my_rank==0) + { + find_section("Lattice parameters"); + read_line("beta","%lf",&beta); + read_line("c0","%lf",&c0); + nk=count_tokens("kappa"); + read_line("csw","%lf",&csw); + } + + MPI_Bcast(&beta,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&c0,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&nk,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&csw,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + if (nk>0) + { + kappa=malloc(nk*sizeof(*kappa)); + error(kappa==NULL,1,"read_lat_parms [check2.c]", + "Unable to allocate parameter array"); + if (my_rank==0) + read_dprms("kappa",nk,kappa); + MPI_Bcast(kappa,nk,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + kappa=NULL; + + set_lat_parms(beta,c0,nk,kappa,csw); + + if (nk>0) + free(kappa); +} + + +static void read_bc_parms(void) +{ + int bc; + double cG,cG_prime,cF,cF_prime; + double phi[2],phi_prime[2]; + + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + cG=1.0; + cG_prime=1.0; + cF=1.0; + cF_prime=1.0; + + if (bc==1) + read_dprms("phi",2,phi); + + if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + + if (bc!=3) + { + read_line("cG","%lf",&cG); + read_line("cF","%lf",&cF); + } + + if (bc==2) + { + read_line("cG'","%lf",&cG_prime); + read_line("cF'","%lf",&cF_prime); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cG,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cG_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + set_bc_parms(bc,cG,cG_prime,cF,cF_prime,phi,phi_prime); +} + + +static void read_hmc_parms(void) +{ + int nact,*iact; + int npf,nmu,nlv; + double tau,*mu; + + if (my_rank==0) + { + find_section("HMC parameters"); + nact=count_tokens("actions"); + read_line("npf","%d",&npf); + nmu=count_tokens("mu"); + read_line("nlv","%d",&nlv); + read_line("tau","%lf",&tau); + } + + MPI_Bcast(&nact,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&npf,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmu,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nlv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&tau,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + if (nact>0) + { + iact=malloc(nact*sizeof(*iact)); + error(iact==NULL,1,"read_hmc_parms [check2.c]", + "Unable to allocate temporary array"); + if (my_rank==0) + read_iprms("actions",nact,iact); + MPI_Bcast(iact,nact,MPI_INT,0,MPI_COMM_WORLD); + } + else + iact=NULL; + + if (nmu>0) + { + mu=malloc(nmu*sizeof(*mu)); + error(mu==NULL,1,"read_hmc_parms [check2.c]", + "Unable to allocate temporary array"); + if (my_rank==0) + read_dprms("mu",nmu,mu); + MPI_Bcast(mu,nmu,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + mu=NULL; + + set_hmc_parms(nact,iact,npf,nmu,mu,nlv,tau); + + if (nact>0) + free(iact); + if (nmu>0) + free(mu); +} + + +static void read_integrator(void) +{ + int nlv,i,j,k,l; + hmc_parms_t hmc; + mdint_parms_t mdp; + force_parms_t fp; + rat_parms_t rp; + + hmc=hmc_parms(); + nlv=hmc.nlv; + + for (i=0;i0)) + add2counter("modes",2,status+2); +} + + +static void start_hmc(double *act0,su3_dble *uold) +{ + int i,n,nact,*iact; + int status[3]; + double *mu; + su3_dble *udb; + dfl_parms_t dfl; + hmc_parms_t hmc; + action_parms_t ap; + + clear_counters(); + + udb=udfld(); + cm3x3_assign(4*VOLUME,udb,uold); + chs_ubnd(-1); + random_mom(); + + dfl=dfl_parms(); + + if (dfl.Ns) + { + dfl_modes(status); + error_root(status[0]<0,1,"start_hmc [hmc.c]", + "Deflation subspace generation failed (status = %d)", + status[0]); + add2counter("modes",0,status); + } + + hmc=hmc_parms(); + nact=hmc.nact; + iact=hmc.iact; + mu=hmc.mu; + n=2; + + for (i=0;idmax) + dmax=dev; + } + + return dmax; +} + + +static double max_dev_ud(su3_dble *v) +{ + double d,dmax; + su3_dble *u,*um; + + u=udfld(); + um=u+4*VOLUME; + dmax=0.0; + + for (;udmax) + dmax=d; + + v+=1; + } + + if (NPROC>1) + { + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + return sqrt(dmax); +} + + +int main(int argc,char *argv[]) +{ + int first,last,step; + int nc,nsize,icnfg,nact,i; + int isap,idfl; + int nwud,nws,nwsd,nwv,nwvd; + double *act0,*act1,*act2; + double sm0[2],sm1[2],dud,dH; + double dudmin,dudmax,dudavg,dHmin,dHmax,dHavg; + su3_dble **usv; + hmc_parms_t hmc; + char cnfg_dir[NAME_SIZE],cnfg_file[NAME_SIZE]; + char nbase[NAME_SIZE]; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check2.log","w",stdout); + fin=freopen("check2.in","r",stdin); + + printf("\n"); + printf("Reversibility of the MD evolution\n"); + printf("---------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + find_section("Configurations"); + read_line("cnfg_dir","%s",cnfg_dir); + read_line("name","%s",nbase); + read_line("first","%d",&first); + read_line("last","%d",&last); + read_line("step","%d",&step); + } + + MPI_Bcast(cnfg_dir,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(nbase,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(&first,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&last,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&step,1,MPI_INT,0,MPI_COMM_WORLD); + + read_lat_parms(); + read_bc_parms(); + read_hmc_parms(); + read_actions(); + read_integrator(); + read_solvers(); + + if (my_rank==0) + fclose(fin); + + hmc_wsize(&nwud,&nws,&nwsd,&nwv,&nwvd); + alloc_wud(nwud); + alloc_ws(nws); + alloc_wsd(nwsd); + alloc_wv(nwv); + alloc_wvd(nwvd); + usv=reserve_wud(1); + + hmc=hmc_parms(); + nact=hmc.nact; + act0=malloc(3*(nact+1)*sizeof(*act0)); + act1=act0+nact+1; + act2=act1+nact+1; + error(act0==NULL,1,"main [check2.c]","Unable to allocate action arrays"); + + print_lat_parms(); + print_bc_parms(); + print_hmc_parms(); + print_action_parms(); + print_rat_parms(); + print_mdint_parms(); + print_force_parms2(); + print_solver_parms(&isap,&idfl); + if (isap) + print_sap_parms(0); + if (idfl) + print_dfl_parms(1); + + if (my_rank==0) + { + printf("Configurations %sn%d -> %sn%d in steps of %d\n\n", + nbase,first,nbase,last,step); + fflush(flog); + } + + start_ranlux(0,1234); + geometry(); + + error_root(((last-first)%step)!=0,1,"main [check2.c]", + "last-first is not a multiple of step"); + check_dir_root(cnfg_dir); + + nsize=name_size("%s/%sn%d",cnfg_dir,nbase,last); + error_root(nsize>=NAME_SIZE,1,"main [check2.c]", + "Configuration file name is too long"); + + hmc_sanity_check(); + set_mdsteps(); + setup_counters(); + setup_chrono(); + + dudmin=0.0; + dudmax=0.0; + dudavg=0.0; + dHmin=0.0; + dHmax=0.0; + dHavg=0.0; + + for (icnfg=first;icnfg<=last;icnfg+=step) + { + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + import_cnfg(cnfg_file); + + if (my_rank==0) + { + printf("Configuration no %d\n",icnfg); + fflush(flog); + } + + start_hmc(act0,usv[0]); + dud=max_dev_ud(usv[0]); + run_mdint(); + end_hmc(act1); + + sm0[0]=0.0; + sm0[1]=0.0; + + for (i=0;i<=nact;i++) + { + sm0[0]+=act0[i]; + sm0[1]+=(act1[i]-act0[i]); + } + + MPI_Reduce(sm0,sm1,2,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(sm1,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("start_hmc:\n"); + printf("max|U_ij-U'_ij| = %.1e\n",dud); + printf("run_mdint:\n"); + printf("H = %.6e\n",sm1[0]); + printf("dH = %.2e\n",sm1[1]); + fflush(flog); + } + + print_all_avgstat(); + + flip_mom(); + run_mdint(); + end_hmc(act2); + + sm0[0]=0.0; + sm0[1]=0.0; + + for (i=0;i<=nact;i++) + { + sm0[0]+=act2[i]; + sm0[1]+=(act2[i]-act0[i]); + } + + MPI_Reduce(sm0,sm1,2,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(sm1,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + dH=fabs(sm1[1]); + dud=max_dev_ud(usv[0]); + error_chk(); + + if (my_rank==0) + { + printf("Flip momenta and run_mdint:\n"); + printf("H = %.6e\n",sm1[0]); + printf("|dH| = % .2e\n",dH); + printf("max|U_ij-U'_ij| = %.2e\n\n",dud); + fflush(flog); + } + + if (icnfg==first) + { + dudmin=dud; + dudmax=dud; + dudavg=dud; + + dHmin=dH; + dHmax=dH; + dHavg=dH; + } + else + { + if (duddudmax) + dudmax=dud; + dudavg+=dud; + + if (dHdHmax) + dHmax=dH; + dHavg+=dH; + } + } + + if (my_rank==0) + { + nc=(last-first)/step+1; + + printf("Test summary\n"); + printf("------------\n\n"); + + printf("Considered %d configurations in the range %d -> %d\n\n", + nc,first,last); + + printf("The three figures quoted in each case are the minimal,\n"); + printf("maximal and average values\n\n"); + + printf("max|U_ij-U'_ij| = %.2e, %.2e, %.2e\n", + dudmin,dudmax,dudavg/(double)(nc)); + printf("|dH| = %.2e, %.2e, %.2e\n\n", + dHmin,dHmax,dHavg/(double)(nc)); + + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check2.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check2.in new file mode 100644 index 0000000000000000000000000000000000000000..f9bea7b590a6990e38c3ee8854dab0859aadf84f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check2.in @@ -0,0 +1,161 @@ + +[Configurations] +cnfg_dir /home/data/openQCD/cnfg +name 16x8x8x8b6.00id2 +first 7 +last 7 +step 2 + +[Lattice parameters] +beta 6.0 +c0 1.6667 +kappa 0.1300 0.12505 +csw 1.234 + +[Boundary conditions] +type 0 +#phi 0.12 -0.56 +#phi' 0.92 0.76 +cG 1.10 +#cG' 1.05 +cF 0.95 +#cF' 0.90 + +[HMC parameters] +actions 0 1 2 3 4 +npf 4 +mu 0.1 1.0 +nlv 3 +tau 0.5 + +[Action 1] +action ACF_TM1 +ipf 0 +im0 0 +imu 1 +isp 0 + +[Action 0] +action ACG + +[Action 2] +action ACF_TM2 +ipf 1 +im0 0 +imu 0 1 +isp 1 0 + +[Action 3] +action ACF_RAT_SDET +ipf 2 +im0 1 +irat 0 0 6 +isp 2 + +[Action 4] +action ACF_RAT +ipf 3 +im0 1 +irat 0 7 11 +isp 2 + +[Rational 0] +degree 12 +range 0.001 7.7 + +[Level 0] +integrator OMF4 +nstep 1 +forces 0 + +[Level 1] +integrator OMF4 +nstep 1 +forces 1 2 3 + +[Level 2] +integrator LPFR +nstep 3 +forces 4 + +[Force 0] +force FRG + +[Force 1] +force FRF_TM1 +isp 3 +ncr 0 + +[Force 2] +force FRF_TM2 +isp 4 +ncr 0 + +[Force 3] +force FRF_RAT_SDET +isp 5 + +[Force 4] +force FRF_RAT +isp 5 + +[Solver 0] +solver CGNE +nmx 256 +res 1.0e-12 + +[Solver 1] +solver SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 24 +res 1.0e-12 + +[Solver 2] +solver MSCG +nmx 256 +res 1.0e-12 + +[Solver 3] +solver CGNE +nmx 256 +res 1.0e-10 + +[Solver 4] +solver SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 24 +res 1.0e-10 + +[Solver 5] +solver MSCG +nmx 256 +res 1.0e-10 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 8 + +[Deflation subspace generation] +kappa 0.1350 +mu 0.01 +ninv 5 +nmr 4 +ncy 5 + +[Deflation projection] +nkv 16 +nmx 64 +res 1.0e-2 + +[Deflation update scheme] +dtau 0.3 +nsm 1 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..4c79ba504d2d1e6a40d37fcaa18dc9948e4d74c0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check3.c @@ -0,0 +1,812 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2005, 2007, 2009-2013 Martin Luescher, Filippo Palombi, +* Stefan Schaefer +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Conservation of the Hamilton function by the MD evolution. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "linalg.h" +#include "archive.h" +#include "forces.h" +#include "dfl.h" +#include "update.h" +#include "global.h" + +static int my_rank; + + +static void read_lat_parms(void) +{ + int nk; + double beta,c0,csw,*kappa; + + if (my_rank==0) + { + find_section("Lattice parameters"); + read_line("beta","%lf",&beta); + read_line("c0","%lf",&c0); + nk=count_tokens("kappa"); + read_line("csw","%lf",&csw); + } + + MPI_Bcast(&beta,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&c0,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&nk,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&csw,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + if (nk>0) + { + kappa=malloc(nk*sizeof(*kappa)); + error(kappa==NULL,1,"read_lat_parms [check3.c]", + "Unable to allocate parameter array"); + if (my_rank==0) + read_dprms("kappa",nk,kappa); + MPI_Bcast(kappa,nk,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + kappa=NULL; + + set_lat_parms(beta,c0,nk,kappa,csw); + + if (nk>0) + free(kappa); +} + + +static void read_bc_parms(void) +{ + int bc; + double cG,cG_prime,cF,cF_prime; + double phi[2],phi_prime[2]; + + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + cG=1.0; + cG_prime=1.0; + cF=1.0; + cF_prime=1.0; + + if (bc==1) + read_dprms("phi",2,phi); + + if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + + if (bc!=3) + { + read_line("cG","%lf",&cG); + read_line("cF","%lf",&cF); + } + + if (bc==2) + { + read_line("cG'","%lf",&cG_prime); + read_line("cF'","%lf",&cF_prime); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cG,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cG_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + set_bc_parms(bc,cG,cG_prime,cF,cF_prime,phi,phi_prime); +} + + +static void read_hmc_parms(void) +{ + int nact,*iact; + int npf,nmu,nlv; + double tau,*mu; + + if (my_rank==0) + { + find_section("HMC parameters"); + nact=count_tokens("actions"); + read_line("npf","%d",&npf); + nmu=count_tokens("mu"); + read_line("nlv","%d",&nlv); + read_line("tau","%lf",&tau); + } + + MPI_Bcast(&nact,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&npf,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmu,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nlv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&tau,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + if (nact>0) + { + iact=malloc(nact*sizeof(*iact)); + error(iact==NULL,1,"read_hmc_parms [check3.c]", + "Unable to allocate temporary array"); + if (my_rank==0) + read_iprms("actions",nact,iact); + MPI_Bcast(iact,nact,MPI_INT,0,MPI_COMM_WORLD); + } + else + iact=NULL; + + if (nmu>0) + { + mu=malloc(nmu*sizeof(*mu)); + error(mu==NULL,1,"read_hmc_parms [check3.c]", + "Unable to allocate temporary array"); + if (my_rank==0) + read_dprms("mu",nmu,mu); + MPI_Bcast(mu,nmu,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + mu=NULL; + + set_hmc_parms(nact,iact,npf,nmu,mu,nlv,tau); + + if (nact>0) + free(iact); + if (nmu>0) + free(mu); +} + + +static void read_integrator(void) +{ + int nlv,i,j,k,l; + hmc_parms_t hmc; + mdint_parms_t mdp; + force_parms_t fp; + rat_parms_t rp; + + hmc=hmc_parms(); + nlv=hmc.nlv; + + for (i=0;i0)) + add2counter("modes",2,status+2); +} + + +static void start_hmc(double *act0,su3_dble *uold,su3_alg_dble *mold) +{ + int i,n,nact,*iact; + int status[3]; + double *mu; + su3_dble *udb; + mdflds_t *mdfs; + dfl_parms_t dfl; + hmc_parms_t hmc; + action_parms_t ap; + + clear_counters(); + + udb=udfld(); + cm3x3_assign(4*VOLUME,udb,uold); + chs_ubnd(-1); + random_mom(); + mdfs=mdflds(); + assign_alg2alg(4*VOLUME,(*mdfs).mom,mold); + dfl=dfl_parms(); + + if (dfl.Ns) + { + dfl_modes(status); + error_root(status[0]<0,1,"start_hmc [hmc.c]", + "Deflation subspace generation failed (status = %d)", + status[0]); + add2counter("modes",0,status); + } + + hmc=hmc_parms(); + nact=hmc.nact; + iact=hmc.iact; + mu=hmc.mu; + n=2; + + for (i=0;i %sn%d in steps of %d\n\n", + nbase,first,nbase,last,step); + fflush(flog); + } + + start_ranlux(0,1234); + geometry(); + + error_root(((last-first)%step)!=0,1,"main [check3.c]", + "last-first is not a multiple of step"); + check_dir_root(cnfg_dir); + + nsize=name_size("%s/%sn%d",cnfg_dir,nbase,last); + error_root(nsize>=NAME_SIZE,1,"main [check3.c]", + "Configuration file name is too long"); + + hmc_sanity_check(); + setup_counters(); + setup_chrono(); + + for (icnfg=first;icnfg<=last;icnfg+=step) + { + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + import_cnfg(cnfg_file); + + if (my_rank==0) + { + printf("Configuration no %d\n",icnfg); + fflush(flog); + } + + for (i=0;i<4;i++) + { + set_hmc_parms(hmc.nact,hmc.iact,hmc.npf, + hmc.nmu,hmc.mu,hmc.nlv,tau[i]); + set_mdsteps(); + + if (i==0) + start_hmc(act0,usv[0],fsv[0]); + else + restart_hmc(usv[0],fsv[0]); + + run_mdint(); + end_hmc(act1); + + sm0[0]=0.0; + sm0[1]=0.0; + sm0[2]=0.0; + + for (j=0;j<=nact;j++) + { + sm0[0]+=act0[j]; + sm0[1]+=act1[j]; + sm0[2]+=(act1[j]-act0[j]); + } + + MPI_Reduce(sm0,sm1,3,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(sm1,3,MPI_DOUBLE,0,MPI_COMM_WORLD); + dH[i]=fabs(sm1[2]); + + if (my_rank==0) + { + if (i==0) + { + printf("start_hmc:\n"); + printf("H = %.6e\n",sm1[0]); + fflush(flog); + } + + printf("run_md:\n"); + printf("tau = %.3f\n",tau[i]); + printf("H = %.6e, |dH| = %.2e\n",sm1[1],dH[i]); + fflush(flog); + } + + print_all_avgstat(); + } + + error_chk(); + + if (my_rank==0) + { + printf("\n"); + printf("tau = %.2e, |dH| = %.2e\n",tau[0],dH[0]); + + for (i=1;i<4;i++) + { + printf("tau = %.2e, |dH| = %.2e, |dH[i]|/|dH[i-1]| = %.2e\n", + tau[i],dH[i],dH[i]/dH[i-1]); + } + + printf("\n"); + printf("(From one tau to the next, the scale factor s is 4^(1/3),\n" + "i.e. s^{-3,-4,-5} = {%.2e,%.2e,%.2e})\n\n", + pow(4.0,-1.0),pow(4.0,-4.0/3.0),pow(4.0,-5.0/3.0)); + fflush(flog); + } + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check3.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check3.in new file mode 100644 index 0000000000000000000000000000000000000000..0d44690a453e510d120ef8a6783024a8ac951e8e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check3.in @@ -0,0 +1,161 @@ + +[Configurations] +cnfg_dir /home/data/openQCD/cnfg +name 16x8x8x8b6.00id2 +first 7 +last 7 +step 2 + +[Lattice parameters] +beta 6.0 +c0 1.6667 +kappa 0.1300 0.12505 +csw 1.234 + +[Boundary conditions] +type 0 +#phi 0.12 -0.56 +#phi' 0.92 0.76 +cG 1.10 +#cG' 1.05 +cF 0.95 +#cF' 0.90 + +[HMC parameters] +actions 0 1 2 3 4 +npf 4 +mu 0.1 1.0 +nlv 3 +tau 0.5 + +[Action 1] +action ACF_TM1 +ipf 0 +im0 0 +imu 1 +isp 0 + +[Action 0] +action ACG + +[Action 2] +action ACF_TM2 +ipf 1 +im0 0 +imu 0 1 +isp 1 0 + +[Action 3] +action ACF_RAT_SDET +ipf 2 +im0 1 +irat 0 0 6 +isp 2 + +[Action 4] +action ACF_RAT +ipf 3 +im0 1 +irat 0 7 11 +isp 2 + +[Rational 0] +degree 12 +range 0.001 7.7 + +[Level 0] +integrator OMF4 +nstep 1 +forces 0 + +[Level 1] +integrator OMF4 +nstep 1 +forces 1 2 3 + +[Level 2] +integrator LPFR +nstep 2 +forces 4 + +[Force 0] +force FRG + +[Force 1] +force FRF_TM1 +isp 3 +ncr 0 + +[Force 2] +force FRF_TM2 +isp 4 +ncr 0 + +[Force 3] +force FRF_RAT_SDET +isp 5 + +[Force 4] +force FRF_RAT +isp 5 + +[Solver 0] +solver CGNE +nmx 256 +res 1.0e-12 + +[Solver 1] +solver SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 24 +res 1.0e-12 + +[Solver 2] +solver MSCG +nmx 256 +res 1.0e-12 + +[Solver 3] +solver CGNE +nmx 256 +res 1.0e-10 + +[Solver 4] +solver SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 24 +res 1.0e-10 + +[Solver 5] +solver MSCG +nmx 256 +res 1.0e-10 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 8 + +[Deflation subspace generation] +kappa 0.1350 +mu 0.01 +ninv 5 +nmr 4 +ncy 5 + +[Deflation projection] +nkv 16 +nmx 64 +res 1.0e-2 + +[Deflation update scheme] +dtau 0.3 +nsm 1 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check4.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check4.c new file mode 100644 index 0000000000000000000000000000000000000000..bb22f314e42bb1e1cf62f6c8c1fd0c3c4decbb4a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check4.c @@ -0,0 +1,149 @@ + +/******************************************************************************* +* +* File check4.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of add_chrono() and get_chrono(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "linalg.h" +#include "mdflds.h" +#include "update.h" +#include "global.h" + + +static void set_psi(spinor_dble **chi,spinor_dble *psi) +{ + int i; + double t; + complex_dble z; + + t=mdtime(); + assign_sd2sd(VOLUME,chi[0],psi); + + for (i=1;i<4;i++) + { + z.re=pow(t,(double)(i)); + z.im=0.0; + mulc_spinor_add_dble(VOLUME,psi,chi[i],z); + } +} + + +int main(int argc,char *argv[]) +{ + int my_rank,i; + int nop,iop,itu; + int ncr,ifr,zero; + double phi[2],phi_prime[2]; + double kappa,mu,eps,dev; + spinor_dble **chi,**wsd; + mdstep_t *s,*sm; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check4.log","w",stdout); + + printf("\n"); + printf("Check of add_chrono() and get_chrono()\n"); + printf("--------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + } + + mu=0.5; + zero=0; + ncr=4; + + kappa=0.1365; + set_lat_parms(5.3,1.6667,1,&kappa,1.789); + phi[0]=0.378; + phi[1]=0.012; + phi_prime[0]=0.892; + phi_prime[1]=0.912; + set_bc_parms(0,1.23,1.27,0.98,1.03,phi,phi_prime); + + set_hmc_parms(0,NULL,1,1,&mu,2,2.0); + ifr=0; + set_mdint_parms(0,OMF4,0.0,1,1,&ifr); + ifr=1; + set_mdint_parms(1,OMF4,0.2,ncr,1,&ifr); + + set_force_parms(0,FRG,0,0,0,NULL,NULL,NULL); + set_force_parms(1,FRF_TM1,0,0,0,&zero,&zero,&ncr); + + print_mdint_parms(); + print_force_parms(); + + start_ranlux(0,1234); + geometry(); + alloc_wsd(6); + chi=reserve_wsd(4); + wsd=reserve_wsd(2); + + setup_chrono(); + set_mdsteps(); + s=mdsteps(&nop,&itu); + sm=s+nop; + + for (i=0;i<4;i++) + random_sd(VOLUME,chi[i],1.0); + + for (;s +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "sflds.h" +#include "linalg.h" +#include "dirac.h" +#include "sap.h" +#include "dfl.h" +#include "forces.h" +#include "update.h" +#include "global.h" + + +static double random_pf(void) +{ + mdflds_t *mdfs; + + mdfs=mdflds(); + random_sd(VOLUME,(*mdfs).pf[0],1.0); + bnd_sd2zero(ALL_PTS,(*mdfs).pf[0]); + + return norm_square_dble(VOLUME,1,(*mdfs).pf[0]); +} + + +static void divide_pf(double mu,int isp,int *status) +{ + mdflds_t *mdfs; + spinor_dble *phi,*chi,**wsd; + solver_parms_t sp; + sap_parms_t sap; + + mdfs=mdflds(); + phi=(*mdfs).pf[0]; + sp=solver_parms(isp); + + if (sp.solver==CGNE) + { + tmcg(sp.nmx,sp.res,mu,phi,phi,status); + + error_root(status[0]<0,1,"divide_pf [check5.c]", + "CGNE solver failed (parameter set no %d, status = %d)", + isp,status[0]); + + wsd=reserve_wsd(1); + chi=wsd[0]; + assign_sd2sd(VOLUME,phi,chi); + Dw_dble(-mu,chi,phi); + mulg5_dble(VOLUME,phi); + release_wsd(); + } + else if (sp.solver==SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + mulg5_dble(VOLUME,phi); + sap_gcr(sp.nkv,sp.nmx,sp.res,mu,phi,phi,status); + + error_root(status[0]<0,1,"divide_pf [check5.c]", + "SAP_GCR solver failed (parameter set no %d, status = %d)", + isp,status[0]); + + set_sap_parms(sap.bs,sap.isolv,sap.nmr,sap.ncy); + } + else if (sp.solver==DFL_SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + mulg5_dble(VOLUME,phi); + dfl_sap_gcr(sp.nkv,sp.nmx,sp.res,mu,phi,phi,status); + + error_root((status[0]<0)||(status[1]<0),1, + "divide_pf [check5.c]","DFL_SAP_GCR solver failed " + "(parameter set no %d, status = (%d,%d,%d))", + isp,status[0],status[1],status[2]); + + set_sap_parms(sap.bs,sap.isolv,sap.nmr,sap.ncy); + } +} + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,irw,isp,status[6],mnkv; + int bs[4],Ns,nmx,nkv,nmr,ncy,ninv; + double chi[2],chi_prime[2]; + double kappa,mu,res; + double mu1,mu2,act0,act1,sqn0,sqn1; + double da,ds,damx,dsmx; + solver_parms_t sp; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check5.log","w",stdout); + fin=freopen("check5.in","r",stdin); + + printf("\n"); + printf("Comparison of rwtm*() with action1()\n"); + printf("------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check6.c]", + "Syntax: check6 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.782); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + chi[0]=0.123; + chi[1]=-0.534; + chi_prime[0]=0.912; + chi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,0.953,1.203,chi,chi_prime); + print_bc_parms(); + + mnkv=0; + + for (isp=0;isp<3;isp++) + { + read_solver_parms(isp); + sp=solver_parms(isp); + + if (sp.nkv>mnkv) + mnkv=sp.nkv; + } + + if (my_rank==0) + { + find_section("SAP"); + read_line("bs","%d %d %d %d",bs,bs+1,bs+2,bs+3); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + set_sap_parms(bs,0,1,1); + + if (my_rank==0) + { + find_section("Deflation subspace"); + read_line("bs","%d %d %d %d",bs,bs+1,bs+2,bs+3); + read_line("Ns","%d",&Ns); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_parms(bs,Ns); + + if (my_rank==0) + { + find_section("Deflation subspace generation"); + read_line("kappa","%lf",&kappa); + read_line("mu","%lf",&mu); + read_line("ninv","%d",&ninv); + read_line("nmr","%d",&nmr); + read_line("ncy","%d",&ncy); + } + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&ninv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_gen_parms(kappa,mu,ninv,nmr,ncy); + + if (my_rank==0) + { + find_section("Deflation projection"); + read_line("nkv","%d",&nkv); + read_line("nmx","%d",&nmx); + read_line("res","%lf",&res); + fclose(fin); + } + + MPI_Bcast(&nkv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + set_dfl_pro_parms(nkv,nmx,res); + set_hmc_parms(0,NULL,1,0,NULL,1,1.0); + + print_solver_parms(status,status+1); + print_sap_parms(0); + print_dfl_parms(0); + + start_ranlux(0,1245); + geometry(); + + mnkv=2*mnkv+2; + if (mnkv<(Ns+2)) + mnkv=Ns+2; + if (mnkv<5) + mnkv=5; + + alloc_ws(mnkv); + alloc_wsd(6); + alloc_wv(2*nkv+2); + alloc_wvd(4); + damx=0.0; + dsmx=0.0; + + for (irw=1;irw<5;irw++) + { + for (isp=0;isp<3;isp++) + { + if (isp==0) + { + set_sw_parms(1.0877); + if (irw<3) + mu1=1.0; + else + mu1=0.0; + mu2=1.23; + } + else if (isp==1) + { + set_sw_parms(0.0877); + if (irw<3) + mu1=0.1; + else + mu1=0.0; + mu2=0.123; + } + else + { + set_sw_parms(-0.0123); + if (irw<3) + mu1=0.01; + else + mu1=0.0; + mu2=0.0123; + } + + random_ud(); + chs_ubnd(-1); + + if (isp==2) + { + dfl_modes(status); + error_root(status[0]<0,1,"main [check5.c]", + "dfl_modes failed"); + } + + start_ranlux(0,8910+isp); + sqn0=random_pf(); + + if ((irw&0x1)==1) + act0=(mu2*mu2-mu1*mu1)*action1(mu1,0,isp,1,status); + else + { + if ((isp==0)||(isp==1)) + divide_pf(mu1,isp,status+1); + else + divide_pf(mu1,isp,status+3); + + act0=mu1*mu1*(mu2*mu2-mu1*mu1)*action1(mu1,0,isp,1,status); + act0+=2.0*mu2*mu2*mu2*mu2*action1(sqrt(2.0)*mu2,0,isp,1,status); + act0*=((mu2*mu2-mu1*mu1)/(2*mu2*mu2-mu1*mu1)); + } + + if (my_rank==0) + { + printf("Solver number %d, mu1 = %.2e, mu2 = %.2e\n",isp,mu1,mu2); + printf("action1(): "); + + if ((isp==0)||(isp==1)) + printf("status = %d\n",status[0]); + else if (isp==2) + printf("status = (%d,%d,%d)\n", + status[0],status[1],status[2]); + } + + start_ranlux(0,8910+isp); + + if ((irw&0x1)==1) + act1=rwtm1(mu1,mu2,isp,&sqn1,status); + else + act1=rwtm2(mu1,mu2,isp,&sqn1,status); + + da=fabs(1.0-act1/act0); + ds=fabs(1.0-sqn1/sqn0); + + if (da>damx) + damx=da; + if (ds>dsmx) + dsmx=ds; + + if (my_rank==0) + { + if ((irw&0x1)==1) + { + printf("rwtm1(): "); + + if ((isp==0)||(isp==1)) + printf("status = %d\n",status[0]); + else if (isp==2) + printf("status = (%d,%d,%d)\n", + status[0],status[1],status[2]); + } + else + { + printf("rwtm2(): "); + + if ((isp==0)||(isp==1)) + printf("status = %d,%d\n",status[0],status[1]); + else if (isp==2) + printf("status = (%d,%d,%d),(%d,%d,%d)\n", + status[0],status[1],status[2],status[3], + status[4],status[5]); + } + + printf("|1-act1/act0| = %.1e, |1-sqn1/sqn0| = %.1e\n\n",da,ds); + } + + error_chk(); + } + } + + if (my_rank==0) + { + printf("max|1-act1/act0| = %.1e, max|1-sqn1/sqn0| = %.1e\n\n",damx,dsmx); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check5.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check5.in new file mode 100644 index 0000000000000000000000000000000000000000..ff9c44f39aa004b13ba19ba377713fad35faaa11 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check5.in @@ -0,0 +1,43 @@ + +[Solver 0] +solver CGNE +nmx 256 +res 1.0e-12 + +[Solver 1] +solver SAP_GCR +nmx 128 +nkv 16 +isolv 0 +nmr 4 +ncy 3 +res 1.0e-12 + +[Solver 2] +solver DFL_SAP_GCR +nmx 64 +nkv 16 +isolv 1 +nmr 4 +ncy 5 +res 1.0e-12 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 8 + +[Deflation subspace generation] +kappa 0.1350 +mu 0.01 +ninv 5 +nmr 4 +ncy 5 + +[Deflation projection] +nkv 16 +nmx 64 +res 1.0e-2 + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check6.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check6.c new file mode 100644 index 0000000000000000000000000000000000000000..217a79f1a9d6a93792d028854f1aeac9f1da259c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check6.c @@ -0,0 +1,370 @@ + +/******************************************************************************* +* +* File check6.c +* +* Copyright (C) 2012-2014 Stefan Schaefer, Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Comparison of rwtm*eo() with action4(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "sflds.h" +#include "linalg.h" +#include "dirac.h" +#include "sap.h" +#include "dfl.h" +#include "forces.h" +#include "update.h" +#include "global.h" + + +static double random_pf(void) +{ + mdflds_t *mdfs; + + mdfs=mdflds(); + random_sd(VOLUME/2,(*mdfs).pf[0],1.0); + + set_sd2zero(VOLUME/2,(*mdfs).pf[0]+VOLUME/2); + bnd_sd2zero(ALL_PTS,(*mdfs).pf[0]); + + return norm_square_dble(VOLUME/2,1,(*mdfs).pf[0]); +} + + +static void divide_pf(double mu,int isp,int *status) +{ + mdflds_t *mdfs; + spinor_dble *phi,*chi,**wsd; + solver_parms_t sp; + sap_parms_t sap; + tm_parms_t tm; + + tm=tm_parms(); + if (tm.eoflg!=1) + set_tm_parms(1); + + mdfs=mdflds(); + phi=(*mdfs).pf[0]; + sp=solver_parms(isp); + + if (sp.solver==CGNE) + { + tmcgeo(sp.nmx,sp.res,mu,phi,phi,status); + + error_root(status[0]<0,1,"divide_pf [check6.c]", + "CGNE solver failed (parameter set no %d, status = %d)", + isp,status[0]); + + wsd=reserve_wsd(1); + chi=wsd[0]; + assign_sd2sd(VOLUME/2,phi,chi); + Dwhat_dble(-mu,chi,phi); + mulg5_dble(VOLUME/2,phi); + set_sd2zero(VOLUME/2,phi+VOLUME/2); + release_wsd(); + } + else if (sp.solver==SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + mulg5_dble(VOLUME/2,phi); + set_sd2zero(VOLUME/2,phi+VOLUME/2); + sap_gcr(sp.nkv,sp.nmx,sp.res,mu,phi,phi,status); + set_sd2zero(VOLUME/2,phi+VOLUME/2); + + error_root(status[0]<0,1,"divide_pf [check6.c]", + "SAP_GCR solver failed (parameter set no %d, status = %d)", + isp,status[0]); + } + else if (sp.solver==DFL_SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + mulg5_dble(VOLUME/2,phi); + set_sd2zero(VOLUME/2,phi+VOLUME/2); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu,phi,phi,status); + set_sd2zero(VOLUME/2,phi+VOLUME/2); + + error_root((status[0]<0)||(status[1]<0),1, + "divide_pf [check6.c]","DFL_SAP_GCR solver failed " + "(parameter set no %d, status = (%d,%d,%d))", + isp,status[0],status[1],status[2]); + } +} + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,irw,isp,status[6],mnkv; + int bs[4],Ns,nmx,nkv,nmr,ncy,ninv; + double chi[2],chi_prime[2]; + double kappa,mu,res; + double mu1,mu2,act0,act1,sqn0,sqn1; + double da,ds,damx,dsmx; + solver_parms_t sp; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check6.log","w",stdout); + fin=freopen("check6.in","r",stdin); + + printf("\n"); + printf("Comparison of rwtm*eo() with action4()\n"); + printf("--------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check6.c]", + "Syntax: check6 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.782); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + chi[0]=0.123; + chi[1]=-0.534; + chi_prime[0]=0.912; + chi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,0.953,1.203,chi,chi_prime); + print_bc_parms(); + + mnkv=0; + + for (isp=0;isp<3;isp++) + { + read_solver_parms(isp); + sp=solver_parms(isp); + + if (sp.nkv>mnkv) + mnkv=sp.nkv; + } + + if (my_rank==0) + { + find_section("SAP"); + read_line("bs","%d %d %d %d",bs,bs+1,bs+2,bs+3); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + set_sap_parms(bs,0,1,1); + + if (my_rank==0) + { + find_section("Deflation subspace"); + read_line("bs","%d %d %d %d",bs,bs+1,bs+2,bs+3); + read_line("Ns","%d",&Ns); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_parms(bs,Ns); + + if (my_rank==0) + { + find_section("Deflation subspace generation"); + read_line("kappa","%lf",&kappa); + read_line("mu","%lf",&mu); + read_line("ninv","%d",&ninv); + read_line("nmr","%d",&nmr); + read_line("ncy","%d",&ncy); + } + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&ninv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_gen_parms(kappa,mu,ninv,nmr,ncy); + + if (my_rank==0) + { + find_section("Deflation projection"); + read_line("nkv","%d",&nkv); + read_line("nmx","%d",&nmx); + read_line("res","%lf",&res); + fclose(fin); + } + + MPI_Bcast(&nkv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + set_dfl_pro_parms(nkv,nmx,res); + set_hmc_parms(0,NULL,1,0,NULL,1,1.0); + + print_solver_parms(status,status+1); + print_sap_parms(0); + print_dfl_parms(0); + + start_ranlux(0,1245); + geometry(); + + mnkv=2*mnkv+2; + if (mnkv<(Ns+2)) + mnkv=Ns+2; + if (mnkv<5) + mnkv=5; + + alloc_ws(mnkv); + alloc_wsd(6); + alloc_wv(2*nkv+2); + alloc_wvd(4); + damx=0.0; + dsmx=0.0; + + for (irw=1;irw<5;irw++) + { + for (isp=0;isp<3;isp++) + { + if (isp==0) + { + set_sw_parms(1.0877); + if (irw<3) + mu1=1.0; + else + mu1=0.0; + mu2=1.23; + } + else if (isp==1) + { + set_sw_parms(0.0877); + if (irw<3) + mu1=0.1; + else + mu1=0.0; + mu2=0.123; + } + else + { + set_sw_parms(-0.0123); + if (irw<3) + mu1=0.01; + else + mu1=0.0; + mu2=0.0123; + } + + random_ud(); + chs_ubnd(-1); + + if (isp==2) + { + dfl_modes(status); + error_root(status[0]<0,1,"main [check6.c]", + "dfl_modes failed"); + } + + start_ranlux(0,8910+isp); + sqn0=random_pf(); + + if ((irw&0x1)==1) + act0=(mu2*mu2-mu1*mu1)*action4(mu1,0,0,isp,1,status); + else + { + if ((isp==0)||(isp==1)) + divide_pf(mu1,isp,status+1); + else + divide_pf(mu1,isp,status+3); + + act0=mu1*mu1*(mu2*mu2-mu1*mu1)*action4(mu1,0,0,isp,1,status); + act0+=2.0*mu2*mu2*mu2*mu2*action4(sqrt(2.0)*mu2,0,0,isp,1,status); + act0*=((mu2*mu2-mu1*mu1)/(2*mu2*mu2-mu1*mu1)); + } + + if (my_rank==0) + { + printf("Solver number %d, mu1 = %.2e, mu2 = %.2e\n",isp,mu1,mu2); + printf("action4(): "); + + if ((isp==0)||(isp==1)) + printf("status = %d\n",status[0]); + else if (isp==2) + printf("status = (%d,%d,%d)\n", + status[0],status[1],status[2]); + } + + start_ranlux(0,8910+isp); + + if ((irw&0x1)==1) + act1=rwtm1eo(mu1,mu2,isp,&sqn1,status); + else + act1=rwtm2eo(mu1,mu2,isp,&sqn1,status); + + da=fabs(1.0-act1/act0); + ds=fabs(1.0-sqn1/sqn0); + + if (da>damx) + damx=da; + if (ds>dsmx) + dsmx=ds; + + if (my_rank==0) + { + if ((irw&0x1)==1) + { + printf("rwtm1eo(): "); + + if ((isp==0)||(isp==1)) + printf("status = %d\n",status[0]); + else if (isp==2) + printf("status = (%d,%d,%d)\n", + status[0],status[1],status[2]); + } + else + { + printf("rwtm2eo(): "); + + if ((isp==0)||(isp==1)) + printf("status = %d,%d\n",status[0],status[1]); + else if (isp==2) + printf("status = (%d,%d,%d),(%d,%d,%d)\n", + status[0],status[1],status[2],status[3], + status[4],status[5]); + } + + printf("|1-act1/act0| = %.1e, |1-sqn1/sqn0| = %.1e\n\n",da,ds); + } + + error_chk(); + } + } + + if (my_rank==0) + { + printf("max|1-act1/act0| = %.1e, max|1-sqn1/sqn0| = %.1e\n\n",damx,dsmx); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check6.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check6.in new file mode 100644 index 0000000000000000000000000000000000000000..e7981ba77309d6d191c28b2c21d276e222934314 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check6.in @@ -0,0 +1,42 @@ + +[Solver 0] +solver CGNE +nmx 256 +res 1.0e-12 + +[Solver 1] +solver SAP_GCR +nmx 128 +nkv 16 +isolv 0 +nmr 4 +ncy 3 +res 1.0e-12 + +[Solver 2] +solver DFL_SAP_GCR +nmx 64 +nkv 16 +isolv 1 +nmr 4 +ncy 5 +res 1.0e-12 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 8 + +[Deflation subspace generation] +kappa 0.1350 +mu 0.01 +ninv 5 +nmr 4 +ncy 5 + +[Deflation projection] +nkv 16 +nmx 64 +res 1.0e-2 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/utils/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/utils/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..d0738d2892913e8953f0f62c702b0c40a634f19d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/utils/INDEX @@ -0,0 +1,5 @@ + +Utility programs + +check1 Copying of files + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/utils/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/utils/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..ba45c0c5f9a62140f5c10d22cc1b9065941d172b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/utils/Makefile @@ -0,0 +1,121 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 + +FLAGS = flags lat_parms + +LATTICE = geometry + +RANDOM = ranlux ranlxs ranlxd + +UTILS = endian mutils utils + +MODULES = $(FLAGS) $(LATTICE) $(RANDOM) $(UTILS) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/random:$(MDIR)/utils + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog \ + *.log~ *.dat *.dat~ $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/utils/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/utils/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..708515b3d6dbdfb26164df1d6e0e4c37dbe45a8a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/utils/check1.c @@ -0,0 +1,91 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2005, 2008 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Copying of files. After running this program, one can verify that all +* bytes have been copied correctly using the diff utility +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "archive.h" +#include "global.h" + +#define NRAN 10000 + +static float r[NRAN]; + + +int main(int argc,char *argv[]) +{ + int my_rank,n,err1,err2,iw; + FILE *flog=NULL,*fdat=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + + printf("\n"); + printf("Copying of .log and .dat files from process 0\n"); + printf("---------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + } + + start_ranlux(0,1234); + ranlxs(r,NRAN); + + if (my_rank==0) + { + printf("Write 10 random numbers to check1.log (in asci format)\n"); + printf("and %d numbers to check1.dat (in binary format)\n\n",NRAN); + + fdat=fopen("check1.dat","wb"); + iw=fwrite(&r[0],sizeof(float),NRAN,fdat); + error_root(iw!=NRAN,1,"main [check1.c]","Incorrect write count"); + fclose(fdat); + + for (n=0;n<10;n++) + printf("r[%d] = %.6e\n",n,r[n]); + + printf("\n"); + printf("Copy the files to check1.log~ and check1.dat~ respectively.\n"); + printf("The copying may then be verified using the diff utility\n\n"); + fclose(flog); + + err1=copy_file("check1.log","check1.log~"); + err2=copy_file("check1.dat","check1.dat~"); + + flog=freopen("check1.log","a",stdout); + + if ((err1!=0)||(err2!=0)) + printf("Copying failed: err1 = %d, err2 = %d\n",err1,err2); + } + + error_chk(); + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..f726c1f53ede4ea9d83143106ae471a3fa4371db --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/INDEX @@ -0,0 +1,19 @@ + +Basic utility programs for complex fields + +check1 Allocation and initialization of the global vector fields. + +check2 Check of the programs in the module vinit.c. + +check3 Check of the communication programs cpv_int_bnd() and + cpv_ext_bnd(). + +check4 Check of the communication programs cpvd_int_bnd() and + cpvd_ext_bnd(). + +The programs check3 and check4 accept the option -bc that allows the +type of boundary condition to be chosen at runtime. When the option is not +set, open boundary conditions are assumed. + +The option may be set but has no effect in the case of check1 and check2. + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..b51c6c4e5f10f399b42efccacf0135e365a13584 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/Makefile @@ -0,0 +1,151 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 check4 + +FLAGS = flags lat_parms sap_parms dfl_parms + +LATTICE = bcnds ftidx uidx geometry + +LINALG = salg salg_dble valg valg_dble liealg cmatrix_dble + +LINSOLV = fgcr + +RANDOM = ranlux ranlxs ranlxd gauss + +UFLDS = plaq_sum shift uflds udcom + +SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +UTILS = endian mutils utils wspace + +SFLDS = sflds scom sdcom Pbnd Pbnd_dble + +TCHARGE = ftcom ftensor + +SW_TERM = pauli pauli_dble swflds sw_term + +DIRAC = Dw_dble Dw Dw_bnd + +BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + +SAP = blk_solv sap_com sap sap_gcr + +ARCHIVE = archive + +DFL = dfl_geometry + +VFLDS = vflds vinit vcom vdcom + +MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ + $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) $(DIRAC) \ + $(BLOCK) $(SAP) $(ARCHIVE) $(DFL) $(VFLDS) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/linsolv:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ + $(MDIR)/sflds:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:\ + $(MDIR)/block:$(MDIR)/sap:$(MDIR)/archive:$(MDIR)/dfl:\ + $(MDIR)/vflds + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..c802aadb61cf807b8719678f699bbffe4e9249a6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/check1.c @@ -0,0 +1,165 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2007, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Allocation and initialization of the global vector fields +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "vflds.h" +#include "global.h" + +#define NFIELDS 7 + + +int main(int argc,char *argv[]) +{ + int my_rank,ie,k,ix; + int bs[4],Ns; + int nb,nbb,nv,nvec; + complex **wv; + complex_dble **wvd; + dfl_parms_t dfl; + FILE *fin=NULL,*flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + fin=freopen("check1.in","r",stdin); + + printf("\n"); + printf("Allocation and initialization of the global vector fields\n"); + printf("---------------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + read_line("Ns","%d",&Ns); + fclose(fin); + + printf("bs = %d %d %d %d\n",bs[0],bs[1],bs[2],bs[3]); + printf("Ns = %d\n\n",Ns); + fflush(flog); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + + start_ranlux(0,123456); + geometry(); + dfl=set_dfl_parms(bs,Ns); + + error((bs[0]!=dfl.bs[0])||(bs[1]!=dfl.bs[1])|| + (bs[2]!=dfl.bs[2])||(bs[3]!=dfl.bs[3])||(Ns!=dfl.Ns),1, + "main [check1.c]","Parameter bs[4] or Ns are incorrectly set"); + + alloc_wv(NFIELDS); + alloc_wvd(NFIELDS); + wv=reserve_wv(NFIELDS); + wvd=reserve_wvd(NFIELDS); + + nb=VOLUME/(bs[0]*bs[1]*bs[2]*bs[3]); + nbb=(FACE0/(bs[1]*bs[2]*bs[3])+FACE1/(bs[0]*bs[2]*bs[3])+ + FACE2/(bs[0]*bs[1]*bs[3])+FACE3/(bs[0]*bs[1]*bs[2])); + nv=Ns*nb; + nvec=Ns*(nb+nbb); + ie=0; + + for (k=1;k +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "linalg.h" +#include "vflds.h" +#include "global.h" + +#define NFLDS 5 + +static float sig[NFLDS]; +static double sigd[NFLDS]; + + +int main(int argc,char *argv[]) +{ + int my_rank,ie,k,ix; + int bs[4],Ns,nb,nv; + double var,var_all,d,dmax; + complex z; + complex_dble zd; + complex **wv; + complex_dble **wvd; + FILE *fin=NULL,*flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check2.log","w",stdout); + fin=freopen("check1.in","r",stdin); + + printf("\n"); + printf("Check of the programs in the module vinit\n"); + printf("-----------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + read_line("Ns","%d",&Ns); + fclose(fin); + + printf("bs = %d %d %d %d\n",bs[0],bs[1],bs[2],bs[3]); + printf("Ns = %d\n\n",Ns); + fflush(flog); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + + start_ranlux(0,12345); + geometry(); + set_dfl_parms(bs,Ns); + + alloc_wv(2*NFLDS); + alloc_wvd(2*NFLDS); + wv=reserve_wv(2*NFLDS); + wvd=reserve_wvd(2*NFLDS); + + nb=VOLUME/(bs[0]*bs[1]*bs[2]*bs[3]); + nv=Ns*nb; + z.im=0.0f; + zd.im=0.0; + ie=0; + + if (my_rank==0) + { + printf("Choose random single-precision fields\n"); + ranlxs(sig,NFLDS); + } + + MPI_Bcast(sig,NFLDS,MPI_FLOAT,0,MPI_COMM_WORLD); + + for (k=0;k = %.4e (sigma^2 = %.4e)\n", + k,var_all,sig[k]*sig[k]); + } + } + + if (my_rank==0) + { + printf("\n"); + printf("Choose random double-precision fields\n"); + ranlxd(sigd,NFLDS); + } + + MPI_Bcast(sigd,NFLDS,MPI_DOUBLE,0,MPI_COMM_WORLD); + + for (k=0;k = %.4e (sigma^2 = %.4e)\n", + k,var_all,sigd[k]*sigd[k]); + } + } + + for (k=0;kdmax) + dmax=d; + } + + if (my_rank==0) + { + printf("\n"); + printf("Relative deviations (should be less than 1.0e-7 or so):\n"); + printf("add_v2vd(): %.1e\n",sqrt(dmax)); + } + + dmax=0.0; + + for (k=0;kdmax) + dmax=d; + } + + if (my_rank==0) + { + printf("diff_vd2v(): %.1e\n\n",sqrt(dmax)); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..b252a15a9f4f127a3d57f90901cf24ee14e1922f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/check3.c @@ -0,0 +1,341 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2007, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the communication programs cpv_int_bnd() and cpv_ext_bnd(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "linalg.h" +#include "dfl.h" +#include "vflds.h" +#include "global.h" + +static int bs[4],Ns,nv,nvec; +static int nb,nbb,*nbbe,*nbbo,*obbe,*obbo; +static int (*inn)[8],*ipp; + + +static void set_field(complex *v) +{ + int n[4],no[4],c[4]; + int i0,i1,i2,i3,ibe,ibo; + + n[0]=L0/bs[0]; + n[1]=L1/bs[1]; + n[2]=L2/bs[2]; + n[3]=L3/bs[3]; + + no[0]=cpr[0]*n[0]; + no[1]=cpr[1]*n[1]; + no[2]=cpr[2]*n[2]; + no[3]=cpr[3]*n[3]; + + set_v2zero(nv,v); + ibe=0; + ibo=(n[0]*n[1]*n[2]*n[3])/2; + + for (i0=0;i01)|| + ((ifc==0)&&(cpr[0]!=0))|| + ((ifc==1)&&(cpr[0]!=(NPROC0-1)))|| + (bc==3)) + { + for (ibb=obbe[ifc];ibb<(obbe[ifc]+nbbe[ifc]);ibb++) + { + ib=ipp[ibb]; + + for (mu=0;mu<4;mu++) + { + c[mu]=v[nv+ibb*Ns+mu].re-v[ib*Ns+mu].re; + + if (mu==(ifc/2)) + { + if ((ifc&0x1)==0x0) + { + c[mu]+=1.0f; + + if (cpr[mu]==0) + c[mu]-=n[mu]; + } + else + { + c[mu]-=1.0f; + + if (cpr[mu]==(np[mu]-1)) + c[mu]+=n[mu]; + } + } + } + + if ((c[0]!=0.0f)||(c[1]!=0.0f)||(c[2]!=0.0f)||(c[3]!=0.0f)) + ie=1; + } + } + else + { + for (ibb=obbe[ifc];ibb<(obbe[ifc]+nbbe[ifc]);ibb++) + { + for (i=0;i1)|| + ((ifc==0)&&(cpr[0]!=0))|| + ((ifc==1)&&(cpr[0]!=(NPROC0-1)))|| + (bc==3)) + { + for (ibb=obbo[ifc];ibb<(obbo[ifc]+nbbo[ifc]);ibb++) + { + ib=ipp[ibb]; + vv=v+ib*Ns; + ww=w+ib*Ns; + vm=vv+Ns; + + for (;vv]"); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + Ns=6; + set_dfl_parms(bs,Ns); + + dfl_grid=dfl_geometry(); + nb=dfl_grid.nb; + nbb=dfl_grid.nbb; + nbbe=dfl_grid.nbbe; + nbbo=dfl_grid.nbbo; + obbe=dfl_grid.obbe; + obbo=dfl_grid.obbo; + inn=dfl_grid.inn; + ipp=dfl_grid.ipp; + + alloc_wv(4); + wv=reserve_wv(4); + + nv=Ns*nb; + nvec=Ns*(nb+nbb/2); + z.re=-1.0f; + z.im=0.0f; + + for (i=0;i<2;i++) + { + random_v(nvec,wv[i],1.0f); + set_field(wv[i]); + assign_v2v(nv,wv[i],wv[i+1]); + cpv_int_bnd(wv[i]); + mulc_vadd(nv,wv[i+1],wv[i],z); + d=vnorm_square(nv,1,wv[i+1]); + + error_root(d!=0.0f,1,"main [check3.c]", + "cpv_int_bnd() modifies the input field on the local grid"); + + ie=chk_ext_bnd(wv[i]); + error(ie==1,1,"main [check3.c]", + "Boundary values are incorrectly mapped by cpv_int_bnd()"); + error(ie==2,1,"main [check3.c]", + "Boundary values are not set to zero where they should"); + + random_iv(nvec,wv[i]); + cpv_int_bnd(wv[i]); + assign_v2v(nvec,wv[i],wv[i+1]); + cpv_ext_bnd(wv[i]); + mulc_vadd(nvec-nv,wv[i]+nv,wv[i+1]+nv,z); + d=vnorm_square(nvec-nv,1,wv[i]+nv); + + error_root(d!=0.0f,1,"main [check3.c]", + "cpv_ext_bnd() modifies the input field on the boundary"); + + ie=chk_int_bnd(wv[i],wv[i+1]); + error(ie==1,1,"main [check3.c]", + "Boundary values are incorrectly mapped by cpv_ext_bnd()"); + } + + error_chk(); + + if (my_rank==0) + { + printf("No errors detected\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/check4.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/check4.c new file mode 100644 index 0000000000000000000000000000000000000000..236aecd00baf208d11e50c29b1000343905bf2be --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/check4.c @@ -0,0 +1,342 @@ + +/******************************************************************************* +* +* File check4.c +* +* Copyright (C) 2007, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the communication programs cpvd_int_bnd() and cpvd_ext_bnd(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "linalg.h" +#include "dfl.h" +#include "vflds.h" +#include "global.h" + +static int bs[4],Ns,nv,nvec; +static int nb,nbb,*nbbe,*nbbo,*obbe,*obbo; +static int (*inn)[8],*ipp; + + +static void set_field(complex_dble *v) +{ + int n[4],no[4],c[4]; + int i0,i1,i2,i3,ibe,ibo; + + n[0]=L0/bs[0]; + n[1]=L1/bs[1]; + n[2]=L2/bs[2]; + n[3]=L3/bs[3]; + + no[0]=cpr[0]*n[0]; + no[1]=cpr[1]*n[1]; + no[2]=cpr[2]*n[2]; + no[3]=cpr[3]*n[3]; + + set_vd2zero(nv,v); + ibe=0; + ibo=(n[0]*n[1]*n[2]*n[3])/2; + + for (i0=0;i01)|| + ((ifc==0)&&(cpr[0]!=0))|| + ((ifc==1)&&(cpr[0]!=(NPROC0-1)))|| + (bc==3)) + { + for (ibb=obbe[ifc];ibb<(obbe[ifc]+nbbe[ifc]);ibb++) + { + ib=ipp[ibb]; + + for (mu=0;mu<4;mu++) + { + c[mu]=v[nv+ibb*Ns+mu].re-v[ib*Ns+mu].re; + + if (mu==(ifc/2)) + { + if ((ifc&0x1)==0x0) + { + c[mu]+=1.0; + + if (cpr[mu]==0) + c[mu]-=n[mu]; + } + else + { + c[mu]-=1.0; + + if (cpr[mu]==(np[mu]-1)) + c[mu]+=n[mu]; + } + } + } + + if ((c[0]!=0.0)||(c[1]!=0.0)||(c[2]!=0.0)||(c[3]!=0.0)) + ie=1; + } + } + else + { + for (ibb=obbe[ifc];ibb<(obbe[ifc]+nbbe[ifc]);ibb++) + { + for (i=0;i1)|| + ((ifc==0)&&(cpr[0]!=0))|| + ((ifc==1)&&(cpr[0]!=(NPROC0-1))) + ||(bc==3)) + { + for (ibb=obbo[ifc];ibb<(obbo[ifc]+nbbo[ifc]);ibb++) + { + ib=ipp[ibb]; + vv=v+ib*Ns; + ww=w+ib*Ns; + vm=vv+Ns; + + for (;vv]"); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + Ns=4; + set_dfl_parms(bs,Ns); + + dfl_grid=dfl_geometry(); + nb=dfl_grid.nb; + nbb=dfl_grid.nbb; + nbbe=dfl_grid.nbbe; + nbbo=dfl_grid.nbbo; + obbe=dfl_grid.obbe; + obbo=dfl_grid.obbo; + inn=dfl_grid.inn; + ipp=dfl_grid.ipp; + + alloc_wvd(4); + wv=reserve_wvd(4); + + nv=Ns*nb; + nvec=Ns*(nb+nbb/2); + z.re=-1.0; + z.im=0.0; + + for (i=0;i<2;i++) + { + random_vd(nvec,wv[i],1.0); + set_field(wv[i]); + assign_vd2vd(nv,wv[i],wv[i+1]); + cpvd_int_bnd(wv[i]); + mulc_vadd_dble(nv,wv[i+1],wv[i],z); + d=vnorm_square_dble(nv,1,wv[i+1]); + + error_root(d!=0.0,1,"main [check4.c]", + "cpvd_int_bnd() modifies the input field on the local grid"); + + ie=chk_ext_bnd(wv[i]); + error(ie==1,1,"main [check4.c]", + "Boundary values are incorrectly mapped by cpvd_int_bnd()"); + error(ie==2,1,"main [check3.c]", + "Boundary values are not set to zero where they should"); + + random_iv(nvec,wv[i]); + cpvd_int_bnd(wv[i]); + assign_vd2vd(nvec,wv[i],wv[i+1]); + cpvd_ext_bnd(wv[i]); + mulc_vadd_dble(nvec-nv,wv[i]+nv,wv[i+1]+nv,z); + d=vnorm_square_dble(nvec-nv,1,wv[i]+nv); + + error_root(d!=0.0,1,"main [check4.c]", + "cpvd_ext_bnd() modifies the input field on the boundary"); + + ie=chk_int_bnd(wv[i],wv[i+1]); + error(ie==1,1,"main [check4.c]", + "Boundary values are incorrectly mapped by cpvd_ext_bnd()"); + } + + error_chk(); + + if (my_rank==0) + { + printf("No errors detected\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..3a15a74deee7765c78faba902b6555709861b920 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/INDEX @@ -0,0 +1,14 @@ + +Integration of the Wilson flow + +check1 Basic checks on the implementation of the Wilson flow. + +check2 Gauge covariance of the Wilson flow. + +check3 Convergence of the numerical integration. + +The programs check1 and check2 accept the option -bc that allows the +type of boundary condition to be chosen (open boundary conditions are assumed +if the option is not set). In the case of check3, the boundary conditions are +set through the input parameter file. + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..5aa4b4baa64aabc0d4499f62c05f141575ec2b82 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/Makefile @@ -0,0 +1,143 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 + +FLAGS = flags lat_parms hmc_parms dfl_parms + +LATTICE = bcnds uidx ftidx geometry + +ARCHIVE = archive + +LINALG = liealg cmatrix_dble + +RANDOM = ranlux ranlxs ranlxd gauss random_su3 + +UFLDS = plaq_sum shift uflds udcom bstap + +MDFLDS = mdflds fcom + +SFLDS = sflds + +SU3FCTS = chexp su3prod su3ren cm3x3 + +UTILS = endian mutils utils wspace + +FORCES = force0 + +TCHARGE = ftcom ftensor tcharge + +WFLOW = wflow + +MODULES = $(FLAGS) $(LATTICE) $(ARCHIVE) $(LINALG) $(RANDOM) $(UFLDS) \ + $(MDFLDS) $(SFLDS) $(SU3FCTS) $(UTILS) $(FORCES) $(TCHARGE) \ + $(WFLOW) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/mdflds:$(MDIR)/sflds:\ + $(MDIR)/su3fcts:$(MDIR)/utils:$(MDIR)/forces:$(MDIR)/tcharge:\ + $(MDIR)/wflow + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..527d6c0cbee80c59b5d5b123b9d78ba1cac62363 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check1.c @@ -0,0 +1,552 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2010-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Basic checks on the implementation of the Wilson flow. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "linalg.h" +#include "forces.h" +#include "wflow.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static const su3_alg_dble fr0={0.0}; +static su3_alg_dble XX ALIGNED16; +static su3_dble mm,uu,vv ALIGNED16; + + +static double cmp_ud(su3_dble *u,su3_dble *v) +{ + int i; + double r[18],dev,dmax; + + r[ 0]=(*u).c11.re-(*v).c11.re; + r[ 1]=(*u).c11.im-(*v).c11.im; + r[ 2]=(*u).c12.re-(*v).c12.re; + r[ 3]=(*u).c12.im-(*v).c12.im; + r[ 4]=(*u).c13.re-(*v).c13.re; + r[ 5]=(*u).c13.im-(*v).c13.im; + + r[ 6]=(*u).c21.re-(*v).c21.re; + r[ 7]=(*u).c21.im-(*v).c21.im; + r[ 8]=(*u).c22.re-(*v).c22.re; + r[ 9]=(*u).c22.im-(*v).c22.im; + r[10]=(*u).c23.re-(*v).c23.re; + r[11]=(*u).c23.im-(*v).c23.im; + + r[12]=(*u).c31.re-(*v).c31.re; + r[13]=(*u).c31.im-(*v).c31.im; + r[14]=(*u).c32.re-(*v).c32.re; + r[15]=(*u).c32.im-(*v).c32.im; + r[16]=(*u).c33.re-(*v).c33.re; + r[17]=(*u).c33.im-(*v).c33.im; + + dmax=0.0; + + for (i=0;i<18;i+=2) + { + dev=r[i]*r[i]+r[i+1]*r[i+1]; + if (dev>dmax) + dmax=dev; + } + + return sqrt(dmax); +} + + +static double max_dev_ud(su3_dble *v) +{ + double d,dmax; + su3_dble *u,*um; + + u=udfld(); + um=u+4*VOLUME; + dmax=0.0; + + for (;udmax) + dmax=d; + + v+=1; + } + + if (NPROC>1) + { + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + return dmax; +} + + +static double cmp_fd(su3_alg_dble *f,su3_alg_dble *g) +{ + int i; + double r[8],dev,dmax; + + r[0]=(*f).c1-(*g).c1; + r[1]=(*f).c2-(*g).c2; + r[2]=(*f).c3-(*g).c3; + r[3]=(*f).c4-(*g).c4; + r[4]=(*f).c5-(*g).c5; + r[5]=(*f).c6-(*g).c6; + r[6]=(*f).c7-(*g).c7; + r[7]=(*f).c8-(*g).c8; + + dmax=0.0; + + for (i=0;i<8;i++) + { + dev=fabs(r[i]); + if (dev>dmax) + dmax=dev; + } + + return dmax; +} + + +static double max_dev_frc(su3_alg_dble *g) +{ + double d,dmax; + su3_alg_dble *f,*fm; + mdflds_t *mdfs; + + mdfs=mdflds(); + f=(*mdfs).frc; + fm=f+4*VOLUME; + dmax=0.0; + + for (;fdmax) + dmax=d; + + g+=1; + } + + if (NPROC>1) + { + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + return dmax; +} + + +static int is_zero(su3_alg_dble *X) +{ + int ie; + + ie=((*X).c1==0.0); + ie&=((*X).c2==0.0); + ie&=((*X).c3==0.0); + ie&=((*X).c4==0.0); + ie&=((*X).c5==0.0); + ie&=((*X).c6==0.0); + ie&=((*X).c7==0.0); + ie&=((*X).c8==0.0); + + return ie; +} + + +static int check_bnd_fld(su3_alg_dble *fld) +{ + int bc,npts,*pts,*ptm; + int ix,t,ifc,ie; + su3_alg_dble *f; + + bc=bc_type(); + pts=bnd_pts(&npts); + ptm=pts+npts; + pts+=(npts/2); + ie=0; + + for (;ptsdmax) + dmax=d; + } + } + } + } + } + + if (NPROC>1) + { + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + return dmax; +} + + +static void scale_bnd_frc(su3_alg_dble *frc) +{ + int bc,ifc,npts,*pts,*ptm; + su3_alg_dble *fr; + + bc=bc_type(); + + if ((bc==0)||(bc==2)) + { + pts=bnd_pts(&npts); + ptm=pts+npts; + pts+=(npts/2); + + for (;pts]"); + } + + MPI_Bcast(&n,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&eps,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + + set_lat_parms(6.0,1.0,0,NULL,1.0); + print_lat_parms(); + + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + alloc_wud(1); + alloc_wfd(2); + mdfs=mdflds(); + usv=reserve_wud(1); + fsv=reserve_wfd(1); + udb=udfld(); + + if (bc==0) + nplaq=(double)(6*N0-6)*(double)(N1*N2*N3); + else + nplaq=(double)(6*N0)*(double)(N1*N2*N3); + + random_ud(); + act0=action0(1); + act1=3.0*nplaq-plaq_wsum_dble(1); + + plaq_frc(); + ie=check_bnd_fld((*mdfs).frc); + error(ie!=0,1,"main [check1.c]", + "Force vanishes on an incorrect subset of links"); + assign_alg2alg(4*VOLUME,(*mdfs).frc,fsv[0]); + force0(1.0); + ie=check_bnd_fld((*mdfs).frc); + error(ie!=0,1,"main [check1.c]", + "Force vanishes on an incorrect subset of links"); + dev0=max_dev_frc(fsv[0]); + + if (my_rank==0) + { + printf("Random gauge field:\n"); + printf("Action (action0) = %.15e\n",act0); + printf("Action (plaq_wsum) = %.15e\n",2.0*act1); + printf("Deviation of force = %.1e\n\n",dev0); + } + + random_ud(); + cm3x3_assign(4*VOLUME,udb,usv[0]); + plaq_frc(); + assign_alg2alg(4*VOLUME,(*mdfs).frc,fsv[0]); + dev0=chkfrc(); + fwd_euler(1,eps); + frc=fsv[0]; + scale_bnd_frc(frc); + u=udb; + um=u+4*VOLUME; + + for (;uact0)&&(eps>=0.0))||((act1 +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "su3fcts.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "update.h" +#include "wflow.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int bc,nfc[8],ofs[8]; +static const su3_dble ud0={{0.0}}; +static su3_dble *g,*gbuf; +static su3_dble wd ALIGNED16; + + +static void pack_gbuf(void) +{ + int ifc,ib,ix; + + nfc[0]=FACE0/2; + nfc[1]=FACE0/2; + nfc[2]=FACE1/2; + nfc[3]=FACE1/2; + nfc[4]=FACE2/2; + nfc[5]=FACE2/2; + nfc[6]=FACE3/2; + nfc[7]=FACE3/2; + + ofs[0]=0; + ofs[1]=ofs[0]+nfc[0]; + ofs[2]=ofs[1]+nfc[1]; + ofs[3]=ofs[2]+nfc[2]; + ofs[4]=ofs[3]+nfc[3]; + ofs[5]=ofs[4]+nfc[4]; + ofs[6]=ofs[5]+nfc[5]; + ofs[7]=ofs[6]+nfc[6]; + + for (ifc=0;ifc<8;ifc++) + { + for (ib=0;ib0) + { + tag=mpi_tag(); + saddr=npr[ifc^0x1]; + raddr=npr[ifc]; + sbuf=gbuf+ofs[ifc]; + rbuf=g+VOLUME+ofs[ifc]; + + if (np&0x1) + { + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + } + } +} + + +static void random_g(void) +{ + int ix,t; + su3_dble unity,*gx; + + unity=ud0; + unity.c11.re=1.0; + unity.c22.re=1.0; + unity.c33.re=1.0; + gx=g; + + for (ix=0;ix0)||(bc!=1)) + random_su3_dble(gx); + else + (*gx)=unity; + + gx+=1; + } + + if (BNDRY>0) + { + pack_gbuf(); + send_gbuf(); + } +} + + +static void transform_ud(void) +{ + int ix,iy,t,ifc; + su3_dble *u; + + u=udfld(); + + for (ix=(VOLUME/2);ixdmax) + dmax=dev; + } + + return dmax; +} + + +static double max_dev_ud(su3_dble *v) +{ + double d,dmax; + su3_dble *u,*um; + + u=udfld(); + um=u+4*VOLUME; + dmax=0.0; + + for (;udmax) + dmax=d; + + v+=1; + } + + if (NPROC>1) + { + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + return sqrt(dmax); +} + + +int main(int argc,char *argv[]) +{ + int my_rank,n; + double phi[2],phi_prime[2],eps,dev; + su3_dble *udb,**usv; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check2.log","w",stdout); + fin=freopen("check2.in","r",stdin); + + printf("\n"); + printf("Gauge covariance of the Wilson flow\n"); + printf("-----------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("n","%d\n",&n); + read_line("eps","%lf",&eps); + fclose(fin); + + printf("n = %d\n",n); + printf("eps = %.3e\n\n",eps); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check2.c]", + "Syntax: check2 [-bc ]"); + } + + MPI_Bcast(&n,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&eps,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.973,1.127,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + alloc_wud(2); + alloc_wfd(1); + usv=reserve_wud(2); + udb=udfld(); + + g=amalloc(NSPIN*sizeof(*g),4); + + if (BNDRY>0) + gbuf=amalloc((BNDRY/2)*sizeof(*gbuf),4); + + error((g==NULL)||((BNDRY>0)&&(gbuf==NULL)),1,"main [check2.c]", + "Unable to allocate auxiliary arrays"); + + random_ud(); + random_g(); + cm3x3_assign(4*VOLUME,udb,usv[0]); + fwd_euler(n,eps); + transform_ud(); + cm3x3_assign(4*VOLUME,udb,usv[1]); + cm3x3_assign(4*VOLUME,usv[0],udb); + set_flags(UPDATED_UD); + transform_ud(); + fwd_euler(n,eps); + + dev=max_dev_ud(usv[1]); + error_chk(); + + if (my_rank==0) + { + printf("Maximal absolute deviation of U(x,mu) = %.1e\n\n",dev); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check2.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check2.in new file mode 100644 index 0000000000000000000000000000000000000000..d4adf9957594ad832baf6adcf1c986cde572979c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check2.in @@ -0,0 +1,2 @@ +n 4 +eps 0.01 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..3fc0e2d6fcd95af58174cb18e2b59b2f374821cd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check3.c @@ -0,0 +1,378 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2009-2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Convergence of the numerical integration of the Wilson flow. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "archive.h" +#include "forces.h" +#include "tcharge.h" +#include "wflow.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int my_rank; +static char cnfg_dir[NAME_SIZE],cnfg_file[NAME_SIZE]; +static char nbase[NAME_SIZE],end_file[NAME_SIZE]; + + +static void cmp_ud(su3_dble *u,su3_dble *v,double *dev) +{ + int i; + double r[18],d; + + r[ 0]=(*u).c11.re-(*v).c11.re; + r[ 1]=(*u).c11.im-(*v).c11.im; + r[ 2]=(*u).c12.re-(*v).c12.re; + r[ 3]=(*u).c12.im-(*v).c12.im; + r[ 4]=(*u).c13.re-(*v).c13.re; + r[ 5]=(*u).c13.im-(*v).c13.im; + + r[ 6]=(*u).c21.re-(*v).c21.re; + r[ 7]=(*u).c21.im-(*v).c21.im; + r[ 8]=(*u).c22.re-(*v).c22.re; + r[ 9]=(*u).c22.im-(*v).c22.im; + r[10]=(*u).c23.re-(*v).c23.re; + r[11]=(*u).c23.im-(*v).c23.im; + + r[12]=(*u).c31.re-(*v).c31.re; + r[13]=(*u).c31.im-(*v).c31.im; + r[14]=(*u).c32.re-(*v).c32.re; + r[15]=(*u).c32.im-(*v).c32.im; + r[16]=(*u).c33.re-(*v).c33.re; + r[17]=(*u).c33.im-(*v).c33.im; + + dev[0]=0.0; + dev[1]=0.0; + + for (i=0;i<18;i+=2) + { + d=sqrt(r[i]*r[i]+r[i+1]*r[i+1]); + + if (d>dev[0]) + dev[0]=d; + + dev[1]+=d; + } +} + + +static void dev_ud(su3_dble *v,double *dev) +{ + double d[2]; + su3_dble *u,*um; + + u=udfld(); + um=u+4*VOLUME; + dev[0]=0.0; + dev[1]=0.0; + + for (;udev[0]) + dev[0]=d[0]; + + dev[1]+=d[1]; + v+=1; + } + + if (NPROC>1) + { + d[0]=dev[0]; + d[1]=dev[1]; + MPI_Reduce(d,dev,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Reduce(d+1,dev+1,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(dev,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + dev[1]/=((double)(9*NPROC)*(double)(4*VOLUME)); +} + + +static int check_end(void) +{ + int iend; + FILE *end; + + if (my_rank==0) + { + iend=0; + end=fopen(end_file,"r"); + + if (end!=NULL) + { + fclose(end); + remove(end_file); + iend=1; + printf("End flag set, run stopped\n\n"); + } + } + + MPI_Bcast(&iend,1,MPI_INT,0,MPI_COMM_WORLD); + + return iend; +} + + +int main(int argc,char *argv[]) +{ + int first,last,step; + int bc,n,rule,icnfg,ncnfg,nsize; + double phi[2],phi_prime[2]; + double eps,dE[3],dQ[3],dU[2]; + double act[2],qtop[2],dev[2],nplaq; + double wt1,wt2,wtavg; + su3_dble *udb,**usv; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check3.log","w",stdout); + fin=freopen("check3.in","r",stdin); + + printf("\n"); + printf("Convergence of the numerical integration of the Wilson flow\n"); + printf("-----------------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + find_section("Configurations"); + read_line("name","%s",nbase); + read_line("cnfg_dir","%s",cnfg_dir); + read_line("first","%d",&first); + read_line("last","%d",&last); + read_line("step","%d",&step); + + find_section("Boundary conditions"); + read_line("type","%d\n",&bc); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + + if (bc==1) + read_dprms("phi",2,phi); + + if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + + find_section("Wilson flow"); + read_line("n","%d\n",&n); + read_line("eps","%lf\n",&eps); + read_line("rule","%d",&rule); + fclose(fin); + + error_root((rule<0)||(rule>3),1,"main [check3.c]", + "rule must be 1,2 or 3"); + } + + MPI_Bcast(nbase,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(cnfg_dir,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(&first,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&last,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&step,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + MPI_Bcast(&n,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&eps,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&rule,1,MPI_INT,0,MPI_COMM_WORLD); + + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + alloc_wud(2); + alloc_wfd(1); + usv=reserve_wud(2); + udb=udfld(); + + if (my_rank==0) + { + printf("n = %d\n",n); + printf("eps = %.3e\n",eps); + + if (rule==1) + printf("Using the Euler integrator\n\n"); + else if (rule==2) + printf("Using the 2nd order RK integrator\n\n"); + else + printf("Using the 3rd order RK integrator\n\n"); + + printf("Configurations %sn%d -> %sn%d in steps of %d\n\n", + nbase,first,nbase,last,step); + + printf("Comparison of the integrated fields at fixed t=n*eps=%.2e\n", + (double)(n)*eps); + printf("with a precise integration using 5x the input value of n\n\n"); + + printf("The deviation |U_ij-U'_ij| is calculated component by\n"); + printf("component on all links of the lattice\n\n"); + fflush(flog); + } + + error_root(((last-first)%step)!=0,1,"main [check3.c]", + "last-first is not a multiple of step"); + check_dir_root(cnfg_dir); + + nsize=name_size("%s/%sn%d",cnfg_dir,nbase,last); + error_root(nsize>=NAME_SIZE,1,"main [check3.c]", + "cnfg_dir name is too long"); + + sprintf(end_file,"check3.end"); + + if (bc==0) + nplaq=(double)(6*N0-6)*(double)(N1*N2*N3); + else + nplaq=(double)(6*N0)*(double)(N1*N2*N3); + + dE[0]=0.0; + dE[1]=0.0; + dE[2]=0.0; + dQ[0]=0.0; + dQ[1]=0.0; + dQ[2]=0.0; + dU[0]=0.0; + dU[1]=0.0; + wtavg=0.0; + + for (icnfg=first;icnfg<=last;icnfg+=step) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + if (my_rank==0) + { + printf("Configuration no %d:\n\n",icnfg); + fflush(flog); + } + + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + import_cnfg(cnfg_file); + cm3x3_assign(4*VOLUME,udb,usv[0]); + + if (rule==1) + fwd_euler(10*n,eps/10.0); + else if (rule==2) + fwd_rk2(4*n,eps/4.0); + else + fwd_rk3(3*n,eps/3.0); + + cm3x3_assign(4*VOLUME,udb,usv[1]); + act[0]=2.0*(3.0*nplaq-plaq_wsum_dble(1)); + qtop[0]=tcharge(); + + cm3x3_assign(4*VOLUME,usv[0],udb); + set_flags(UPDATED_UD); + + if (rule==1) + fwd_euler(n,eps); + else if (rule==2) + fwd_rk2(n,eps); + else + fwd_rk3(n,eps); + + act[1]=2.0*(3.0*nplaq-plaq_wsum_dble(1)); + qtop[1]=tcharge(); + + dev[0]=fabs(act[1]-act[0]); + if (dev[0]>dE[0]) + dE[0]=dev[0]; + dE[1]+=dev[0]; + dE[2]+=act[0]; + + dev[0]=fabs(qtop[1]-qtop[0]); + if (dev[0]>dQ[0]) + dQ[0]=dev[0]; + dQ[1]+=dev[0]; + dQ[2]+=fabs(qtop[0]); + + dev_ud(usv[1],dev); + if (dev[0]>dU[0]) + dU[0]=dev[0]; + dU[1]+=dev[1]; + + MPI_Barrier(MPI_COMM_WORLD); + wt2=MPI_Wtime(); + wtavg+=(wt2-wt1); + + if (my_rank==0) + { + printf("dE/E = %.1e, dQ = %.1e, max|dU| = %.1e, avg|dU| = %.1e\n\n", + fabs(1.0-act[0]/act[1]),fabs(qtop[1]-qtop[0]),dev[0],dev[1]); + printf("Configuration no %d fully processed in %.2e sec ", + icnfg,wt2-wt1); + printf("(average = %.2e sec)\n\n", + wtavg/(double)((icnfg-first)/step+1)); + fflush(flog); + } + + if (check_end()) + break; + } + + error_chk(); + + ncnfg=(last-first)/step+1; + dE[1]/=(double)(ncnfg); + dE[2]/=(double)(ncnfg); + dQ[1]/=(double)(ncnfg); + dQ[2]/=(double)(ncnfg); + dU[1]/=(double)(ncnfg); + + if (my_rank==0) + { + printf("\n"); + printf("Test summary\n"); + printf("------------\n\n"); + + printf("Processed %d configurations\n\n",ncnfg); + printf("max|dE|/E = %.1e, avg|dE|/E = %.1e\n", + dE[0]/dE[2],dE[1]/dE[2]); + printf("max|dQ| = %.1e, avg|dQ| = %.1e, avg|Q| = %.2e\n", + dQ[0],dQ[1],dQ[2]); + printf("max|dU| = %.1e, avg|dU| = %.1e\n\n", + dU[0],dU[1]); + + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check3.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check3.in new file mode 100644 index 0000000000000000000000000000000000000000..824fe24eb09c2256a45a2b433dd351b5242600dc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check3.in @@ -0,0 +1,17 @@ + +[Configurations] +name 16x8x8x8b6.00id2 +cnfg_dir /home/data/openQCD/cnfg +first 7 +last 7 +step 1 + +[Boundary conditions] +type 0 +# phi 0.937 0.389 +# phi' -0.283 1.23 + +[Wilson flow] +n 100 +eps 0.02 +rule 3 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..2382083ee88e0f55ec7bf2c07e043f169ddc1c91 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/INDEX @@ -0,0 +1,31 @@ + +******************************************************************************** + + Collection of openQCD notes + +******************************************************************************** + +dirac.pdf Implementation of the lattice Dirac operator. + +forces.pdf Molecular-dynamics quark forces. + +gauge_action.pdf Gauge actions in openQCD simulations. + +mscg.pdf Multi-shift conjugate gradient algorithm. + +parms.pdf Parameters of the openQCD main programs. + +ranlux_guide.pdf User's guide for the random number generator ranlux. + Notes accompanying the program files ranlxs.c and + ranlxd.c in modules/random. + +ranlux_notes.pdf Description of the algorithms used in the current + version of the ranlux random number generator. + +rhmc.pdf Charm and strange quark in openQCD simulations. + +stat_fcts.pdf Statistical tests. Notes accompanying the program files + pchi_square.c and ks_test.c in modules/nompi/extras. + +su3_fcts.pdf SU(3) matrix functions. + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/dirac.pdf b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/dirac.pdf new file mode 100644 index 0000000000000000000000000000000000000000..b045bfabfa8ad71b52edea371afb88081d01d790 Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/dirac.pdf differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/forces.pdf b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/forces.pdf new file mode 100644 index 0000000000000000000000000000000000000000..320f71054c609ced7b4257f9b42938e20abcdaee Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/forces.pdf differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/gauge_action.pdf b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/gauge_action.pdf new file mode 100644 index 0000000000000000000000000000000000000000..fad950893b8ebe50ed9f618375ecf491ea4490e8 Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/gauge_action.pdf differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/mscg.pdf b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/mscg.pdf new file mode 100644 index 0000000000000000000000000000000000000000..8fa12523959d7a9701dc65d3c30d20253d9eaca3 Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/mscg.pdf differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/parms.pdf b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/parms.pdf new file mode 100644 index 0000000000000000000000000000000000000000..0025aa7dcce1a825cca887972532015b44ce9b77 Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/parms.pdf differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/ranlux_guide.pdf b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/ranlux_guide.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e02211bfabd2a025a7ce5feb64d96937b67386d7 Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/ranlux_guide.pdf differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/ranlux_notes.pdf b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/ranlux_notes.pdf new file mode 100644 index 0000000000000000000000000000000000000000..a49a62e78293c06a33548c17b370b290f623ac3a Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/ranlux_notes.pdf differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/rhmc.pdf b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/rhmc.pdf new file mode 100644 index 0000000000000000000000000000000000000000..18f9d4051dede3cc697b6686cd84c91d8cbd90ce Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/rhmc.pdf differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/stat_fcts.pdf b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/stat_fcts.pdf new file mode 100644 index 0000000000000000000000000000000000000000..4c2174af899bd911102150c1a421dda085025c0d Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/stat_fcts.pdf differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/su3_fcts.pdf b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/su3_fcts.pdf new file mode 100644 index 0000000000000000000000000000000000000000..19e2461e9c00f0db756887f5dfa3c33eb6e6b434 Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/su3_fcts.pdf differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/archive.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/archive.h new file mode 100644 index 0000000000000000000000000000000000000000..a18a188bbd6ce57d85d9a7a07342ba37c4ee0bb8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/archive.h @@ -0,0 +1,38 @@ + +/******************************************************************************* +* +* File archive.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef ARCHIVE_H +#define ARCHIVE_H + +#ifndef SU3_H +#include "su3.h" +#endif + +/* ARCHIVE_C */ +extern void write_cnfg(char *out); +extern void read_cnfg(char *in); +extern void export_cnfg(char *out); +extern void import_cnfg(char *in); + +/* MARCHIVE_C */ +extern void write_mfld(char *out); +extern void read_mfld(char *in); +extern void export_mfld(char *out); +extern void import_mfld(char *in); + +/* SARCHIVE_C */ +extern void write_sfld(char *out,spinor_dble *sd); +extern void read_sfld(char *in,spinor_dble *sd); +extern void export_sfld(char *out,spinor_dble *sd); +extern void import_sfld(char *in,spinor_dble *sd); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/avx.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/avx.h new file mode 100644 index 0000000000000000000000000000000000000000..03d07137cd9f6009867cf7c48d76cfe3f9170126 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/avx.h @@ -0,0 +1,2754 @@ + +/******************************************************************************* +* +* File avx.h +* +* Copyright (C) 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Macros for Dirac spinors, SU(3) vectors and SU(3) matrices using inline +* assembly AVX instructions. The machine is assumed to comply with the +* x86-64 instruction set. +* +*******************************************************************************/ + +#ifndef AVX_H +#define AVX_H + +#ifndef SSE2_H +#include "sse2.h" +#endif + +typedef struct +{ + float c1,c2,c3,c4; + float c5,c6,c7,c8; +} avx_float __attribute__ ((aligned (32))); + +typedef struct +{ + double c1,c2,c3,c4; +} avx_double __attribute__ ((aligned (32))); + +static avx_double _avx_sgn12_dble __attribute__ ((unused)) ={-1.0,-1.0,1.0,1.0}; +static avx_double _avx_sgn13_dble __attribute__ ((unused)) ={-1.0,1.0,-1.0,1.0}; +static avx_double _avx_sgn14_dble __attribute__ ((unused)) ={-1.0,1.0,1.0,-1.0}; +static avx_double _avx_sgn23_dble __attribute__ ((unused)) ={1.0,-1.0,-1.0,1.0}; +static avx_double _avx_sgn24_dble __attribute__ ((unused)) ={1.0,-1.0,1.0,-1.0}; +static avx_double _avx_sgn34_dble __attribute__ ((unused)) ={1.0,1.0,-1.0,-1.0}; +static avx_double _avx_sgn_dble __attribute__ ((unused)) ={-1.0,-1.0,-1.0,-1.0}; + +static avx_float _avx_sgn_add __attribute__ ((unused)) +={1.0f,1.0f,1.0f,1.0f,-1.0f,-1.0f,-1.0f,-1.0f}; +static avx_float _avx_sgn_i_add __attribute__ ((unused)) +={-1.0f,1.0f,-1.0f,1.0f,1.0f,-1.0f,1.0f,-1.0f}; +static avx_float _avx_sgn_addsub __attribute__ ((unused)) +={1.0f,1.0f,-1.0f,-1.0f,-1.0f,-1.0f,1.0f,1.0f}; +static avx_float _avx_sgn_i_addsub __attribute__ ((unused)) +={-1.0f,1.0f,1.0f,-1.0f,1.0f,-1.0f,-1.0f,1.0f}; + +#define _avx_zeroall() \ +__asm__ __volatile__ ("vzeroall") + +#define _avx_zeroupper() \ +__asm__ __volatile__ ("vzeroupper") + +/******************************************************************************* +* +* Macros operating on single precision data +* +*******************************************************************************/ + +/******************************************************************************* +* +* Macros for spinors in su3_vector order +* +*******************************************************************************/ + +/* +* Loads two spinors sl and sh to the low and high lanes of ymm0,..,ymm5. The +* ordering of the spinor components in the low lane is +* +* xmm0 <- sl.c1.c1,sl.c2.c1 +* xmm1 <- sl.c1.c2,sl.c2.c2 +* xmm2 <- sl.c1.c3,sl.c2.c3 +* xmm3 <- sl.c3.c1,sl.c4.c1 +* xmm4 <- sl.c3.c2,sl.c4.c2 +* xmm5 <- sl.c3.c3,sl.c4.c3 +* +* and those in the high lane are arranged in the same way. The registers +* ymm6,..,ymm11 are changed on exit. +*/ + +#define _avx_spinor_pair_load34(sl,sh) \ +__asm__ __volatile__ ("vmovaps %0, %%xmm6 \n\t" \ + "vmovaps %2, %%xmm7 \n\t" \ + "vmovaps %4, %%xmm8" \ + : \ + : \ + "m" ((sl).c1.c1), \ + "m" ((sl).c1.c2), \ + "m" ((sl).c1.c3), \ + "m" ((sl).c2.c1), \ + "m" ((sl).c2.c2), \ + "m" ((sl).c2.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vmovaps %0, %%xmm9 \n\t" \ + "vmovaps %2, %%xmm10 \n\t" \ + "vmovaps %4, %%xmm11" \ + : \ + : \ + "m" ((sl).c3.c1), \ + "m" ((sl).c3.c2), \ + "m" ((sl).c3.c3), \ + "m" ((sl).c4.c1), \ + "m" ((sl).c4.c2), \ + "m" ((sl).c4.c3) \ + : \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vinsertf128 $0x1, %0, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %2, %%ymm7, %%ymm7 \n\t" \ + "vinsertf128 $0x1, %4, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((sh).c1.c1), \ + "m" ((sh).c1.c2), \ + "m" ((sh).c1.c3), \ + "m" ((sh).c2.c1), \ + "m" ((sh).c2.c2), \ + "m" ((sh).c2.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vinsertf128 $0x1, %0, %%ymm9, %%ymm9 \n\t" \ + "vinsertf128 $0x1, %2, %%ymm10, %%ymm10 \n\t" \ + "vinsertf128 $0x1, %4, %%ymm11, %%ymm11" \ + : \ + : \ + "m" ((sh).c3.c1), \ + "m" ((sh).c3.c2), \ + "m" ((sh).c3.c3), \ + "m" ((sh).c4.c1), \ + "m" ((sh).c4.c2), \ + "m" ((sh).c4.c3) \ + : \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vshufps $0xe4, %%ymm7, %%ymm6, %%ymm0 \n\t" \ + "vshufps $0xe4, %%ymm10, %%ymm9, %%ymm3 \n\t" \ + "vshufps $0x4e, %%ymm8, %%ymm6, %%ymm1 \n\t" \ + "vshufps $0x4e, %%ymm11, %%ymm9, %%ymm4 \n\t" \ + "vshufps $0xe4, %%ymm8, %%ymm7, %%ymm2 \n\t" \ + "vshufps $0xe4, %%ymm11, %%ymm10, %%ymm5" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Loads two spinors sl and sh to the low and high lanes of ymm0,..,ymm5. The +* ordering of the spinor components in the low lane is +* +* xmm0 <- sl.c1.c1,sl.c2.c1 +* xmm1 <- sl.c1.c2,sl.c2.c2 +* xmm2 <- sl.c1.c3,sl.c2.c3 +* xmm3 <- sl.c4.c1,sl.c3.c1 (note: unusual order) +* xmm4 <- sl.c4.c2,sl.c3.c2 +* xmm5 <- sl.c4.c3,sl.c3.c3 +* +* and those in the high lane are arranged in the same way. The registers +* ymm6,..,ymm11 are changed on exit. +*/ + +#define _avx_spinor_pair_load43(sl,sh) \ +__asm__ __volatile__ ("vmovaps %0, %%xmm6 \n\t" \ + "vmovaps %2, %%xmm7 \n\t" \ + "vmovaps %4, %%xmm8" \ + : \ + : \ + "m" ((sl).c1.c1), \ + "m" ((sl).c1.c2), \ + "m" ((sl).c1.c3), \ + "m" ((sl).c2.c1), \ + "m" ((sl).c2.c2), \ + "m" ((sl).c2.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vmovaps %0, %%xmm9 \n\t" \ + "vmovaps %2, %%xmm10 \n\t" \ + "vmovaps %4, %%xmm11" \ + : \ + : \ + "m" ((sl).c3.c1), \ + "m" ((sl).c3.c2), \ + "m" ((sl).c3.c3), \ + "m" ((sl).c4.c1), \ + "m" ((sl).c4.c2), \ + "m" ((sl).c4.c3) \ + : \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vinsertf128 $0x1, %0, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %2, %%ymm7, %%ymm7 \n\t" \ + "vinsertf128 $0x1, %4, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((sh).c1.c1), \ + "m" ((sh).c1.c2), \ + "m" ((sh).c1.c3), \ + "m" ((sh).c2.c1), \ + "m" ((sh).c2.c2), \ + "m" ((sh).c2.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vinsertf128 $0x1, %0, %%ymm9, %%ymm9 \n\t" \ + "vinsertf128 $0x1, %2, %%ymm10, %%ymm10 \n\t" \ + "vinsertf128 $0x1, %4, %%ymm11, %%ymm11" \ + : \ + : \ + "m" ((sh).c3.c1), \ + "m" ((sh).c3.c2), \ + "m" ((sh).c3.c3), \ + "m" ((sh).c4.c1), \ + "m" ((sh).c4.c2), \ + "m" ((sh).c4.c3) \ + : \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vshufps $0xe4, %%ymm7, %%ymm6, %%ymm0 \n\t" \ + "vshufps $0x4e, %%ymm9, %%ymm10, %%ymm3 \n\t" \ + "vshufps $0x4e, %%ymm8, %%ymm6, %%ymm1 \n\t" \ + "vshufps $0xe4, %%ymm9, %%ymm11, %%ymm4 \n\t" \ + "vshufps $0xe4, %%ymm8, %%ymm7, %%ymm2 \n\t" \ + "vshufps $0x4e, %%ymm10, %%ymm11, %%ymm5" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Loads the spinor s to xmm0,..,xmm5 in the order +* +* xmm0 <- s.c1.c1,s.c2.c1 +* xmm1 <- s.c1.c2,s.c2.c2 +* xmm2 <- s.c1.c3,s.c2.c3 +* xmm3 <- s.c3.c1,s.c4.c1 +* xmm4 <- s.c3.c2,s.c4.c2 +* xmm5 <- s.c3.c3,s.c4.c3 +* +* and duplicates these values to the upper lanes of ymm0,..ymm5. The registers +* ymm6,..,ymm11 are changed on exit. +*/ + +#define _avx_spinor_load_dup(s) \ +__asm__ __volatile__ ("vbroadcastf128 %0, %%ymm6 \n\t" \ + "vbroadcastf128 %2, %%ymm7 \n\t" \ + "vbroadcastf128 %4, %%ymm8" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vbroadcastf128 %0, %%ymm9 \n\t" \ + "vbroadcastf128 %2, %%ymm10 \n\t" \ + "vbroadcastf128 %4, %%ymm11" \ + : \ + : \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2), \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vshufps $0xe4, %%ymm7, %%ymm6, %%ymm0 \n\t" \ + "vshufps $0xe4, %%ymm10, %%ymm9, %%ymm3 \n\t" \ + "vshufps $0x4e, %%ymm8, %%ymm6, %%ymm1 \n\t" \ + "vshufps $0x4e, %%ymm11, %%ymm9, %%ymm4 \n\t" \ + "vshufps $0xe4, %%ymm8, %%ymm7, %%ymm2 \n\t" \ + "vshufps $0xe4, %%ymm11, %%ymm10, %%ymm5" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Stores the low and high lanes of ymm0,..,ymm5 to the spinors rl and rh, +* assuming the spinor components are ordered as if they were loaded with +* _avx_spinor_pair_load34(rl,rh). The registers ymm6,..,ymm11 are changed +* on exit. +*/ + +#define _avx_spinor_pair_store34(rl,rh) \ +__asm__ __volatile__ ("vshufps $0x44, %%ymm1, %%ymm0, %%ymm6 \n\t" \ + "vshufps $0x44, %%ymm4, %%ymm3, %%ymm9 \n\t" \ + "vshufps $0xe4, %%ymm0, %%ymm2, %%ymm7 \n\t" \ + "vshufps $0xe4, %%ymm3, %%ymm5, %%ymm10 \n\t" \ + "vshufps $0xee, %%ymm2, %%ymm1, %%ymm8 \n\t" \ + "vshufps $0xee, %%ymm5, %%ymm4, %%ymm11" \ + : \ + : \ + : \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vmovaps %%xmm6, %0 \n\t" \ + "vmovaps %%xmm7, %2 \n\t" \ + "vmovaps %%xmm8, %4" \ + : \ + "=m" ((rl).c1.c1), \ + "=m" ((rl).c1.c2), \ + "=m" ((rl).c1.c3), \ + "=m" ((rl).c2.c1), \ + "=m" ((rl).c2.c2), \ + "=m" ((rl).c2.c3)); \ +__asm__ __volatile__ ("vmovaps %%xmm9, %0 \n\t" \ + "vmovaps %%xmm10, %2 \n\t" \ + "vmovaps %%xmm11, %4" \ + : \ + "=m" ((rl).c3.c1), \ + "=m" ((rl).c3.c2), \ + "=m" ((rl).c3.c3), \ + "=m" ((rl).c4.c1), \ + "=m" ((rl).c4.c2), \ + "=m" ((rl).c4.c3)); \ +__asm__ __volatile__ ("vextractf128 $0x1, %%ymm6, %0 \n\t" \ + "vextractf128 $0x1, %%ymm7, %2 \n\t" \ + "vextractf128 $0x1, %%ymm8, %4" \ + : \ + "=m" ((rh).c1.c1), \ + "=m" ((rh).c1.c2), \ + "=m" ((rh).c1.c3), \ + "=m" ((rh).c2.c1), \ + "=m" ((rh).c2.c2), \ + "=m" ((rh).c2.c3)); \ +__asm__ __volatile__ ("vextractf128 $0x1, %%ymm9, %0 \n\t" \ + "vextractf128 $0x1, %%ymm10, %2 \n\t" \ + "vextractf128 $0x1, %%ymm11, %4" \ + : \ + "=m" ((rh).c3.c1), \ + "=m" ((rh).c3.c2), \ + "=m" ((rh).c3.c3), \ + "=m" ((rh).c4.c1), \ + "=m" ((rh).c4.c2), \ + "=m" ((rh).c4.c3)) + +/* +* Stores the low and high lanes of ymm0,..,ymm5 to the spinors rl and rh, +* assuming the spinor components are ordered as if they were loaded with +* _avx_spinor_pair_load43(rl,rh). The registers ymm6,..,ymm11 are changed +* on exit. +*/ + +#define _avx_spinor_pair_store43(rl,rh) \ +__asm__ __volatile__ ("vshufps $0x44, %%ymm1, %%ymm0, %%ymm6 \n\t" \ + "vshufps $0xee, %%ymm4, %%ymm3, %%ymm9 \n\t" \ + "vshufps $0xe4, %%ymm0, %%ymm2, %%ymm7 \n\t" \ + "vshufps $0x4e, %%ymm3, %%ymm5, %%ymm10 \n\t" \ + "vshufps $0xee, %%ymm2, %%ymm1, %%ymm8 \n\t" \ + "vshufps $0x44, %%ymm5, %%ymm4, %%ymm11" \ + : \ + : \ + : \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vmovaps %%xmm6, %0 \n\t" \ + "vmovaps %%xmm7, %2 \n\t" \ + "vmovaps %%xmm8, %4" \ + : \ + "=m" ((rl).c1.c1), \ + "=m" ((rl).c1.c2), \ + "=m" ((rl).c1.c3), \ + "=m" ((rl).c2.c1), \ + "=m" ((rl).c2.c2), \ + "=m" ((rl).c2.c3)); \ +__asm__ __volatile__ ("vmovaps %%xmm9, %0 \n\t" \ + "vmovaps %%xmm10, %2 \n\t" \ + "vmovaps %%xmm11, %4" \ + : \ + "=m" ((rl).c3.c1), \ + "=m" ((rl).c3.c2), \ + "=m" ((rl).c3.c3), \ + "=m" ((rl).c4.c1), \ + "=m" ((rl).c4.c2), \ + "=m" ((rl).c4.c3)); \ +__asm__ __volatile__ ("vextractf128 $0x1, %%ymm6, %0 \n\t" \ + "vextractf128 $0x1, %%ymm7, %2 \n\t" \ + "vextractf128 $0x1, %%ymm8, %4" \ + : \ + "=m" ((rh).c1.c1), \ + "=m" ((rh).c1.c2), \ + "=m" ((rh).c1.c3), \ + "=m" ((rh).c2.c1), \ + "=m" ((rh).c2.c2), \ + "=m" ((rh).c2.c3)); \ +__asm__ __volatile__ ("vextractf128 $0x1, %%ymm9, %0 \n\t" \ + "vextractf128 $0x1, %%ymm10, %2 \n\t" \ + "vextractf128 $0x1, %%ymm11, %4" \ + : \ + "=m" ((rh).c3.c1), \ + "=m" ((rh).c3.c2), \ + "=m" ((rh).c3.c3), \ + "=m" ((rh).c4.c1), \ + "=m" ((rh).c4.c2), \ + "=m" ((rh).c4.c3)) + +/* +* Loads the lower Weyl spinors of the Dirac spinors sl and sh to the low and +* high lanes of ymm0,..,ymm3. The ordering of the spinor components in the +* low lane is +* +* xmm0 <- sl.c1.c1,sl.c2.c1 +* xmm1 <- sl.c1.c2,sl.c2.c2 +* xmm2 <- sl.c1.c3,sl.c2.c3 +* +* and those in the high lane are arranged in the same way. The registers +* ymm6,..,ymm8 are changed on exit. Also applies if sl and sh are Weyl +* spinors. +*/ + +#define _avx_weyl_pair_load12(sl,sh) \ +__asm__ __volatile__ ("vmovaps %0, %%xmm6 \n\t" \ + "vmovaps %2, %%xmm7 \n\t" \ + "vmovaps %4, %%xmm8" \ + : \ + : \ + "m" ((sl).c1.c1), \ + "m" ((sl).c1.c2), \ + "m" ((sl).c1.c3), \ + "m" ((sl).c2.c1), \ + "m" ((sl).c2.c2), \ + "m" ((sl).c2.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vinsertf128 $0x1, %0, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %2, %%ymm7, %%ymm7 \n\t" \ + "vinsertf128 $0x1, %4, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((sh).c1.c1), \ + "m" ((sh).c1.c2), \ + "m" ((sh).c1.c3), \ + "m" ((sh).c2.c1), \ + "m" ((sh).c2.c2), \ + "m" ((sh).c2.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vshufps $0xe4, %%ymm7, %%ymm6, %%ymm0 \n\t" \ + "vshufps $0x4e, %%ymm8, %%ymm6, %%ymm1 \n\t" \ + "vshufps $0xe4, %%ymm8, %%ymm7, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Loads the upper Weyl spinors of the Dirac spinors sl and sh to the low and +* high lanes of ymm0,..,ymm3. The ordering of the spinor components in the +* low lane is +* +* xmm0 <- sl.c3.c1,sl.c4.c1 +* xmm1 <- sl.c3.c2,sl.c4.c2 +* xmm2 <- sl.c3.c3,sl.c4.c3 +* +* and those in the high lane are arranged in the same way. The registers +* ymm6,..,ymm8 are changed on exit. +*/ + +#define _avx_weyl_pair_load34(sl,sh) \ +__asm__ __volatile__ ("vmovaps %0, %%xmm6 \n\t" \ + "vmovaps %2, %%xmm7 \n\t" \ + "vmovaps %4, %%xmm8" \ + : \ + : \ + "m" ((sl).c3.c1), \ + "m" ((sl).c3.c2), \ + "m" ((sl).c3.c3), \ + "m" ((sl).c4.c1), \ + "m" ((sl).c4.c2), \ + "m" ((sl).c4.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vinsertf128 $0x1, %0, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %2, %%ymm7, %%ymm7 \n\t" \ + "vinsertf128 $0x1, %4, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((sh).c3.c1), \ + "m" ((sh).c3.c2), \ + "m" ((sh).c3.c3), \ + "m" ((sh).c4.c1), \ + "m" ((sh).c4.c2), \ + "m" ((sh).c4.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vshufps $0xe4, %%ymm7, %%ymm6, %%ymm0 \n\t" \ + "vshufps $0x4e, %%ymm8, %%ymm6, %%ymm1 \n\t" \ + "vshufps $0xe4, %%ymm8, %%ymm7, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Stores the low and high lanes of ymm0,..,ymm3 to the lower Weyl spinors +* of the Dirac spinors rl and rh, assuming the spinor components are ordered +* as if they were loaded with _avx_weyl_pair_load12(rl,rh). The registers +* ymm6,..,ymm8 are changed on exit. Also applies if rl and rh are Weyl +* spinors. +*/ + +#define _avx_weyl_pair_store12(rl,rh) \ +__asm__ __volatile__ ("vshufps $0x44, %%ymm1, %%ymm0, %%ymm6 \n\t" \ + "vshufps $0xe4, %%ymm0, %%ymm2, %%ymm7 \n\t" \ + "vshufps $0xee, %%ymm2, %%ymm1, %%ymm8" \ + : \ + : \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vmovaps %%xmm6, %0 \n\t" \ + "vmovaps %%xmm7, %2 \n\t" \ + "vmovaps %%xmm8, %4" \ + : \ + "=m" ((rl).c1.c1), \ + "=m" ((rl).c1.c2), \ + "=m" ((rl).c1.c3), \ + "=m" ((rl).c2.c1), \ + "=m" ((rl).c2.c2), \ + "=m" ((rl).c2.c3)); \ +__asm__ __volatile__ ("vextractf128 $0x1, %%ymm6, %0 \n\t" \ + "vextractf128 $0x1, %%ymm7, %2 \n\t" \ + "vextractf128 $0x1, %%ymm8, %4" \ + : \ + "=m" ((rh).c1.c1), \ + "=m" ((rh).c1.c2), \ + "=m" ((rh).c1.c3), \ + "=m" ((rh).c2.c1), \ + "=m" ((rh).c2.c2), \ + "=m" ((rh).c2.c3)) + +/* +* Stores the low and high lanes of ymm0,..,ymm3 to the upper Weyl spinors +* of the Dirac spinors rl and rh, assuming the spinor components are ordered +* as if they were loaded with _avx_weyl_pair_load34(rl,rh). The registers +* ymm6,..,ymm8 are changed on exit. +*/ + +#define _avx_weyl_pair_store34(rl,rh) \ +__asm__ __volatile__ ("vshufps $0x44, %%ymm1, %%ymm0, %%ymm6 \n\t" \ + "vshufps $0xe4, %%ymm0, %%ymm2, %%ymm7 \n\t" \ + "vshufps $0xee, %%ymm2, %%ymm1, %%ymm8" \ + : \ + : \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vmovaps %%xmm6, %0 \n\t" \ + "vmovaps %%xmm7, %2 \n\t" \ + "vmovaps %%xmm8, %4" \ + : \ + "=m" ((rl).c3.c1), \ + "=m" ((rl).c3.c2), \ + "=m" ((rl).c3.c3), \ + "=m" ((rl).c4.c1), \ + "=m" ((rl).c4.c2), \ + "=m" ((rl).c4.c3)); \ +__asm__ __volatile__ ("vextractf128 $0x1, %%ymm6, %0 \n\t" \ + "vextractf128 $0x1, %%ymm7, %2 \n\t" \ + "vextractf128 $0x1, %%ymm8, %4" \ + : \ + "=m" ((rh).c3.c1), \ + "=m" ((rh).c3.c2), \ + "=m" ((rh).c3.c3), \ + "=m" ((rh).c4.c1), \ + "=m" ((rh).c4.c2), \ + "=m" ((rh).c4.c3)) + +/* +* Splits the registers ymm3,..,ymm5 according to +* +* xmm3 <- ymm3_lo + ymm3_hi +* xmm4 <- ymm4_lo + ymm4_hi +* xmm5 <- ymm5_lo + ymm5_hi +* +* xmm6 <- ymm3_lo - ymm3_hi +* xmm7 <- ymm4_lo - ymm4_hi +* xmm8 <- ymm5_lo - ymm5_hi +* +* where *_lo and *_hi are the low and high lanes of the registers. The +* registers ymm9,..,ymm11 are used as workspace. +*/ + +#define _avx_spinor_split() \ +__asm__ __volatile__ ("vextractf128 $0x1, %%ymm3, %%xmm9 \n\t" \ + "vextractf128 $0x1, %%ymm4, %%xmm10 \n\t" \ + "vextractf128 $0x1, %%ymm5, %%xmm11 \n\t" \ + "vsubps %%xmm9, %%xmm3, %%xmm6 \n\t" \ + "vsubps %%xmm10, %%xmm4, %%xmm7 \n\t" \ + "vsubps %%xmm11, %%xmm5, %%xmm8 \n\t" \ + "vaddps %%xmm9, %%xmm3, %%xmm3 \n\t" \ + "vaddps %%xmm10, %%xmm4, %%xmm4 \n\t" \ + "vaddps %%xmm11, %%xmm5, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11") + +/* +* Moves the lower lanes of ymm6,..,ymm8 to the upper lanes of ymm3,..,ymm5. +*/ + +#define _avx_spinor_unsplit() \ +__asm__ __volatile__ ("vinsertf128 $0x1, %%xmm6, %%ymm3, %%ymm3 \n\t" \ + "vinsertf128 $0x1, %%xmm7, %%ymm4, %%ymm4 \n\t" \ + "vinsertf128 $0x1, %%xmm8, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies ymm3,..,ymm5 by the avx_float c. The register ymm15 is used as +* workspace. +*/ + +#define _avx_spinor_mul_up(c) \ +__asm__ __volatile__ ("vmovaps %0, %%ymm15 \n\t" \ + "vmulps %%ymm15, %%ymm3, %%ymm3 \n\t" \ + "vmulps %%ymm15, %%ymm4, %%ymm4 \n\t" \ + "vmulps %%ymm15, %%ymm5, %%ymm5" \ + : \ + : \ + "m" (c) \ + : \ + "xmm3", "xmm4", "xmm5", "xmm15") + +/* +* Exchanges real and imaginary parts of the double words in ymm3,..,ymm5 +* and multiplies these registers by the avx_float c. The register ymm15 is +* used as workspace. +*/ + +#define _avx_spinor_imul_up(c) \ +__asm__ __volatile__ ("vmovaps %0, %%ymm15 \n\t" \ + "vpermilps $0xb1, %%ymm3, %%ymm3 \n\t" \ + "vpermilps $0xb1, %%ymm4, %%ymm4 \n\t" \ + "vpermilps $0xb1, %%ymm5, %%ymm5 \n\t" \ + "vmulps %%ymm15, %%ymm3, %%ymm3 \n\t" \ + "vmulps %%ymm15, %%ymm4, %%ymm4 \n\t" \ + "vmulps %%ymm15, %%ymm5, %%ymm5" \ + : \ + : \ + "m" (c) \ + : \ + "xmm3", "xmm4", "xmm5", "xmm15") + +/* +* Exchanges the high and low words in the two lanes of ymm3,..,ymm5. +*/ + +#define _avx_spinor_xch_up() \ +__asm__ __volatile__ ("vpermilps $0x4e, %%ymm3, %%ymm3 \n\t" \ + "vpermilps $0x4e, %%ymm4, %%ymm4 \n\t" \ + "vpermilps $0x4e, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Exchanges the high and low words in the two lanes of ymm3,..,ymm5, then the +* real and imaginary parts of the words and finally multiplies the registers +* by the avx_float c. The register ymm15 is used as workspace. +*/ + +#define _avx_spinor_xch_imul_up(c) \ +__asm__ __volatile__ ("vmovaps %0, %%ymm15 \n\t" \ + "vpermilps $0x1b, %%ymm3, %%ymm3 \n\t" \ + "vpermilps $0x1b, %%ymm4, %%ymm4 \n\t" \ + "vpermilps $0x1b, %%ymm5, %%ymm5 \n\t" \ + "vmulps %%ymm15, %%ymm3, %%ymm3 \n\t" \ + "vmulps %%ymm15, %%ymm4, %%ymm4 \n\t" \ + "vmulps %%ymm15, %%ymm5, %%ymm5" \ + : \ + : \ + "m" (c) \ + : \ + "xmm3", "xmm4", "xmm5", "xmm15") + +/* +* Multiplies xmm6,..,xmm8 by the sse_float c. The register ymm15 is used as +* workspace. +*/ + +#define _avx_weyl_mul(c) \ +__asm__ __volatile__ ("vmovaps %0, %%xmm15 \n\t" \ + "vmulps %%xmm15, %%xmm6, %%xmm6 \n\t" \ + "vmulps %%xmm15, %%xmm7, %%xmm7 \n\t" \ + "vmulps %%xmm15, %%xmm8, %%xmm8" \ + : \ + : \ + "m" (c) \ + : \ + "xmm6", "xmm7", "xmm8", "xmm15") + +/* +* Exchanges real and imaginary parts of the double words in xmm6,..,xmm8 +* and multiplies these registers by the sse_float c. The register ymm15 is +* used as workspace. +*/ + +#define _avx_weyl_imul(c) \ +__asm__ __volatile__ ("vmovaps %0, %%xmm15 \n\t" \ + "vpermilps $0xb1, %%xmm6, %%xmm6 \n\t" \ + "vpermilps $0xb1, %%xmm7, %%xmm7 \n\t" \ + "vpermilps $0xb1, %%xmm8, %%xmm8 \n\t" \ + "vmulps %%xmm15, %%xmm6, %%xmm6 \n\t" \ + "vmulps %%xmm15, %%xmm7, %%xmm7 \n\t" \ + "vmulps %%xmm15, %%xmm8, %%xmm8" \ + : \ + : \ + "m" (c) \ + : \ + "xmm6", "xmm7", "xmm8", "xmm15") + +/* +* Exchanges the high and low words of xmm6,..,xmm8. +*/ + +#define _avx_weyl_xch() \ +__asm__ __volatile__ ("vpermilps $0x4e, %%xmm6, %%xmm6 \n\t" \ + "vpermilps $0x4e, %%xmm7, %%xmm7 \n\t" \ + "vpermilps $0x4e, %%xmm8, %%xmm8" \ + : \ + : \ + : \ + "xmm6", "xmm7", "xmm8") + +/* +* Exchanges the high and low words of xmm6,..,xmm8, then the real and +* imaginary parts of the words and finally multiplies the registers by +* the sse_float c. The register ymm15 is used as workspace. +*/ + +#define _avx_weyl_xch_imul(c) \ +__asm__ __volatile__ ("vmovaps %0, %%xmm15 \n\t" \ + "vpermilps $0x1b, %%xmm6, %%xmm6 \n\t" \ + "vpermilps $0x1b, %%xmm7, %%xmm7 \n\t" \ + "vpermilps $0x1b, %%xmm8, %%xmm8 \n\t" \ + "vmulps %%xmm15, %%xmm6, %%xmm6 \n\t" \ + "vmulps %%xmm15, %%xmm7, %%xmm7 \n\t" \ + "vmulps %%xmm15, %%xmm8, %%xmm8" \ + : \ + : \ + "m" (c) \ + : \ + "xmm6", "xmm7", "xmm8", "xmm15") + +/* +* Adds ymm3,..,ymm5 to ymm0,..,ymm2 +*/ + +#define _avx_spinor_add() \ +__asm__ __volatile__ ("vaddps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddps %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Subtracts ymm3,..,ymm5 from ymm0,..,ymm2 +*/ + +#define _avx_spinor_sub() \ +__asm__ __volatile__ ("vsubps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vsubps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vsubps %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Adds (subtracts) the low (high) words in the two lanes of ymm3,..,ymm5 +* to (from) ymm0,..,ymm2. The registers ymm6,ymm7,ymm8 are changed on exit. +*/ + +#define _avx_spinor_addsub() \ +__asm__ __volatile__ ("vaddps %%ymm3, %%ymm0, %%ymm6 \n\t" \ + "vaddps %%ymm4, %%ymm1, %%ymm7 \n\t" \ + "vaddps %%ymm5, %%ymm2, %%ymm8 \n\t" \ + "vsubps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vsubps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vsubps %%ymm5, %%ymm2, %%ymm2 \n\t" \ + "vblendps $0x33, %%ymm6, %%ymm0, %%ymm0 \n\t" \ + "vblendps $0x33, %%ymm7, %%ymm1, %%ymm1 \n\t" \ + "vblendps $0x33, %%ymm8, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm6", "xmm7", "xmm8") + +/* +* Adds (subtracts) the high (low) words in the two lanes of ymm3,..,ymm5 +* to (from) ymm0,..,ymm2. The registers ymm6,..,ymm8 are changed on exit. +*/ + +#define _avx_spinor_subadd() \ +__asm__ __volatile__ ("vaddps %%ymm3, %%ymm0, %%ymm6 \n\t" \ + "vaddps %%ymm4, %%ymm1, %%ymm7 \n\t" \ + "vaddps %%ymm5, %%ymm2, %%ymm8 \n\t" \ + "vsubps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vsubps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vsubps %%ymm5, %%ymm2, %%ymm2 \n\t" \ + "vblendps $0xcc, %%ymm6, %%ymm0, %%ymm0 \n\t" \ + "vblendps $0xcc, %%ymm7, %%ymm1, %%ymm1 \n\t" \ + "vblendps $0xcc, %%ymm8, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm6", "xmm7", "xmm8") + +/* +* Multiplies ymm3,..,ymm5 with i and adds them to ymm0,..,ymm2. The +* registers ymm3,..,ymm5 are changed on exit. +*/ + +#define _avx_spinor_i_add() \ +__asm__ __volatile__ ("vpermilps $0xb1, %%ymm3, %%ymm3 \n\t" \ + "vpermilps $0xb1, %%ymm4, %%ymm4 \n\t" \ + "vpermilps $0xb1, %%ymm5, %%ymm5 \n\t" \ + "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddsubps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddsubps %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies ymm3,..,ymm5 with i and subtracts them from ymm0,..,ymm2. +*/ + +#define _avx_spinor_i_sub() \ +__asm__ __volatile__ ("vpermilps $0xb1, %%ymm0, %%ymm0 \n\t" \ + "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" \ + "vpermilps $0xb1, %%ymm2, %%ymm2 \n\t" \ + "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddsubps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddsubps %%ymm5, %%ymm2, %%ymm2 \n\t" \ + "vpermilps $0xb1, %%ymm0, %%ymm0 \n\t" \ + "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" \ + "vpermilps $0xb1, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Exchanges the high and low words of ymm3,..,ymm5, multiplies them with i +* and adds the result to ymm0,..,ymm2. The registers ymm3,..,ymm5 are +* changed on exit. +*/ + +#define _avx_spinor_xch_i_add() \ +__asm__ __volatile__ ("vpermilps $0x1b, %%ymm3, %%ymm3 \n\t" \ + "vpermilps $0x1b, %%ymm4, %%ymm4 \n\t" \ + "vpermilps $0x1b, %%ymm5, %%ymm5 \n\t" \ + "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddsubps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddsubps %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Exchanges the high and low words of ymm3,..,ymm5, multiplies them with i +* and subtracts the result from ymm0,..,ymm2. +*/ + +#define _avx_spinor_xch_i_sub() \ +__asm__ __volatile__ ("vpermilps $0x1b, %%ymm0, %%ymm0 \n\t" \ + "vpermilps $0x1b, %%ymm1, %%ymm1 \n\t" \ + "vpermilps $0x1b, %%ymm2, %%ymm2 \n\t" \ + "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddsubps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddsubps %%ymm5, %%ymm2, %%ymm2 \n\t" \ + "vpermilps $0x1b, %%ymm0, %%ymm0 \n\t" \ + "vpermilps $0x1b, %%ymm1, %%ymm1 \n\t" \ + "vpermilps $0x1b, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Multiplies the low and high words in the two lanes of ymm3,..,ymm5 with +* i and -i respectively and adds these registers to ymm0,..,ymm2. The +* registers ymm3,..,ymm5 are changed on exit. +*/ + +#define _avx_spinor_i_addsub() \ +__asm__ __volatile__ ("vpermilps $0xb4, %%ymm0, %%ymm0 \n\t" \ + "vpermilps $0xb4, %%ymm1, %%ymm1 \n\t" \ + "vpermilps $0xb4, %%ymm2, %%ymm2 \n\t" \ + "vpermilps $0xe1, %%ymm3, %%ymm3 \n\t" \ + "vpermilps $0xe1, %%ymm4, %%ymm4 \n\t" \ + "vpermilps $0xe1, %%ymm5, %%ymm5 \n\t" \ + "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddsubps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddsubps %%ymm5, %%ymm2, %%ymm2 \n\t" \ + "vpermilps $0xb4, %%ymm0, %%ymm0 \n\t" \ + "vpermilps $0xb4, %%ymm1, %%ymm1 \n\t" \ + "vpermilps $0xb4, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies the low and high words in the two lanes of ymm3,..,ymm5 with +* -i and i respectively and adds these registers to ymm0,..,ymm2. The +* registers ymm3,..,ymm5 are changed on exit. +*/ + +#define _avx_spinor_i_subadd() \ +__asm__ __volatile__ ("vpermilps $0xe1, %%ymm0, %%ymm0 \n\t" \ + "vpermilps $0xe1, %%ymm1, %%ymm1 \n\t" \ + "vpermilps $0xe1, %%ymm2, %%ymm2 \n\t" \ + "vpermilps $0xb4, %%ymm3, %%ymm3 \n\t" \ + "vpermilps $0xb4, %%ymm4, %%ymm4 \n\t" \ + "vpermilps $0xb4, %%ymm5, %%ymm5 \n\t" \ + "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddsubps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddsubps %%ymm5, %%ymm2, %%ymm2 \n\t" \ + "vpermilps $0xe1, %%ymm0, %%ymm0 \n\t" \ + "vpermilps $0xe1, %%ymm1, %%ymm1 \n\t" \ + "vpermilps $0xe1, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Exchanges the high and low words in each lane of ymm3,..,ymm5. +*/ + +#define _avx_spinor_xch() \ +__asm__ __volatile__ ("vpermilps $0x4e, %%ymm3, %%ymm3 \n\t" \ + "vpermilps $0x4e, %%ymm4, %%ymm4 \n\t" \ + "vpermilps $0x4e, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + +/****************************************************************************** +* +* Action of su3 matrices on su3 vectors +* +******************************************************************************/ + +/* +* Multiplies pairs of su3 vectors, stored in the low and high lanes of +* ymm0,..,ymm2, with su3 matrices ul and uh, respectively. The vectors +* are assumed to be in vertical order and the products are returned in the +* same order in the registers ymm3,..,ymm5. All registers except for +* ymm15 are changed on exit. +*/ + +#define _avx_su3_pair_multiply(ul,uh) \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm3 \n\t" \ + "vbroadcastss %1, %%xmm6 \n\t" \ + "vbroadcastss %2, %%xmm4 \n\t" \ + "vbroadcastss %3, %%xmm9 \n\t" \ + "vbroadcastss %4, %%xmm10 \n\t" \ + "vbroadcastss %5, %%xmm11 \n\t" \ + "vinsertf128 $0x1, %%xmm9, %%ymm3, %%ymm3 \n\t" \ + "vinsertf128 $0x1, %%xmm10, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %%xmm11, %%ymm4, %%ymm4" \ + : \ + : \ + "m" ((ul).c11.re), \ + "m" ((ul).c12.re), \ + "m" ((ul).c21.re), \ + "m" ((uh).c11.re), \ + "m" ((uh).c12.re), \ + "m" ((uh).c21.re) \ + : \ + "xmm3", "xmm4", "xmm6", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm7 \n\t" \ + "vbroadcastss %1, %%xmm5 \n\t" \ + "vbroadcastss %2, %%xmm8 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm7, %%ymm7 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm5, %%ymm5 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((ul).c22.re), \ + "m" ((ul).c31.re), \ + "m" ((ul).c32.re), \ + "m" ((uh).c22.re), \ + "m" ((uh).c31.re), \ + "m" ((uh).c32.re) \ + : \ + "xmm5", "xmm7", "xmm8", \ + "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vmulps %%ymm0, %%ymm3, %%ymm3 \n\t" \ + "vmulps %%ymm1, %%ymm6, %%ymm6 \n\t" \ + "vmulps %%ymm0, %%ymm4, %%ymm4 \n\t" \ + "vmulps %%ymm1, %%ymm7, %%ymm7 \n\t" \ + "vmulps %%ymm0, %%ymm5, %%ymm5 \n\t" \ + "vmulps %%ymm1, %%ymm8, %%ymm8 \n\t" \ + "vaddps %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vaddps %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddps %%ymm8, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm9 \n\t" \ + "vbroadcastss %1, %%xmm10 \n\t" \ + "vbroadcastss %2, %%xmm11 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm9, %%ymm9 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm10, %%ymm10 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm11, %%ymm11 \n\t" \ + "vpermilps $0xb1, %%ymm0, %%ymm0" \ + : \ + : \ + "m" ((ul).c13.re), \ + "m" ((ul).c21.im), \ + "m" ((ul).c33.re), \ + "m" ((uh).c13.re), \ + "m" ((uh).c21.im), \ + "m" ((uh).c33.re) \ + : \ + "xmm0", "xmm9", "xmm10", "xmm11", \ + "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm6 \n\t" \ + "vbroadcastss %1, %%xmm7 \n\t" \ + "vbroadcastss %2, %%xmm8 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm7, %%ymm7 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((ul).c11.im), \ + "m" ((ul).c23.re), \ + "m" ((ul).c31.im), \ + "m" ((uh).c11.im), \ + "m" ((uh).c23.re), \ + "m" ((uh).c31.im) \ + : \ + "xmm6", "xmm7", "xmm8", \ + "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vmulps %%ymm2, %%ymm9, %%ymm9 \n\t" \ + "vmulps %%ymm0, %%ymm10, %%ymm10 \n\t" \ + "vmulps %%ymm2, %%ymm11, %%ymm11 \n\t" \ + "vmulps %%ymm0, %%ymm6, %%ymm6 \n\t" \ + "vmulps %%ymm2, %%ymm7, %%ymm7 \n\t" \ + "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" \ + "vaddps %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vaddsubps %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vaddps %%ymm11, %%ymm5, %%ymm5 \n\t" \ + "vaddsubps %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vaddps %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddsubps %%ymm8, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" \ + "vpermilps $0xb1, %%ymm2, %%ymm2 \n\t" \ + "vbroadcastss %0, %%xmm12 \n\t" \ + "vbroadcastss %1, %%xmm13 \n\t" \ + "vbroadcastss %2, %%xmm14 \n\t" \ + "vbroadcastss %3, %%xmm9 \n\t" \ + "vbroadcastss %4, %%xmm10 \n\t" \ + "vbroadcastss %5, %%xmm11 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm9, %%ymm9 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm10, %%ymm10 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm11, %%ymm11" \ + : \ + : \ + "m" ((uh).c12.im), \ + "m" ((uh).c23.im), \ + "m" ((uh).c32.im), \ + "m" ((ul).c12.im), \ + "m" ((ul).c23.im), \ + "m" ((ul).c32.im) \ + : \ + "xmm1", "xmm2", "xmm9", "xmm10", \ + "xmm11", "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm6 \n\t" \ + "vbroadcastss %1, %%xmm7 \n\t" \ + "vbroadcastss %2, %%xmm8 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm7, %%ymm7 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((ul).c13.im), \ + "m" ((ul).c22.im), \ + "m" ((ul).c33.im), \ + "m" ((uh).c13.im), \ + "m" ((uh).c22.im), \ + "m" ((uh).c33.im) \ + : \ + "xmm6", "xmm7", "xmm8", "xmm12", \ + "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vmulps %%ymm1, %%ymm9, %%ymm9 \n\t" \ + "vmulps %%ymm2, %%ymm10, %%ymm10 \n\t" \ + "vmulps %%ymm1, %%ymm11, %%ymm11 \n\t" \ + "vmulps %%ymm2, %%ymm6, %%ymm6 \n\t" \ + "vmulps %%ymm1, %%ymm7, %%ymm7 \n\t" \ + "vmulps %%ymm2, %%ymm8, %%ymm8 \n\t" \ + "vaddsubps %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vaddsubps %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vaddsubps %%ymm11, %%ymm5, %%ymm5 \n\t" \ + "vaddsubps %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vaddsubps %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddsubps %%ymm8, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11") + +/* +* Multiplies pairs of su3 vectors, stored in the low and high lanes of +* ymm0,..,ymm2, by the su3 matrices ul^dagger and uh^dagger, respectively. +* The vectors are assumed to be in vertical order and the products are returned +* in the same order in the registers ymm3,..,ymm5. All registers except for +* ymm15 are changed on exit. +*/ + +#define _avx_su3_pair_inverse_multiply(ul,uh) \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm6 \n\t" \ + "vbroadcastss %1, %%xmm9 \n\t" \ + "vbroadcastss %2, %%xmm7 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm9, %%ymm9 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm7, %%ymm7" \ + : \ + : \ + "m" ((ul).c11.im), \ + "m" ((ul).c21.im), \ + "m" ((ul).c12.im), \ + "m" ((uh).c11.im), \ + "m" ((uh).c21.im), \ + "m" ((uh).c12.im) \ + : \ + "xmm6", "xmm7", "xmm9", \ + "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm10 \n\t" \ + "vbroadcastss %1, %%xmm8 \n\t" \ + "vbroadcastss %2, %%xmm11 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm10, %%ymm10 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm8, %%ymm8 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm11, %%ymm11" \ + : \ + : \ + "m" ((ul).c22.im), \ + "m" ((ul).c13.im), \ + "m" ((ul).c23.im), \ + "m" ((uh).c22.im), \ + "m" ((uh).c13.im), \ + "m" ((uh).c23.im) \ + : \ + "xmm8", "xmm10", "xmm11", \ + "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vmulps %%ymm0, %%ymm6, %%ymm6 \n\t" \ + "vmulps %%ymm1, %%ymm9, %%ymm9 \n\t" \ + "vmulps %%ymm0, %%ymm7, %%ymm7 \n\t" \ + "vmulps %%ymm1, %%ymm10, %%ymm10 \n\t" \ + "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" \ + "vmulps %%ymm1, %%ymm11, %%ymm11 \n\t" \ + "vaddps %%ymm6, %%ymm9, %%ymm9 \n\t" \ + "vaddps %%ymm7, %%ymm10, %%ymm10 \n\t" \ + "vaddps %%ymm8, %%ymm11, %%ymm11" \ + : \ + : \ + : \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm3 \n\t" \ + "vbroadcastss %1, %%xmm4 \n\t" \ + "vbroadcastss %2, %%xmm5 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm3, %%ymm3 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm4, %%ymm4 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm5, %%ymm5 \n\t" \ + "vpermilps $0xb1, %%ymm0, %%ymm0" \ + : \ + : \ + "m" ((ul).c11.re), \ + "m" ((ul).c12.re), \ + "m" ((ul).c13.re), \ + "m" ((uh).c11.re), \ + "m" ((uh).c12.re), \ + "m" ((uh).c13.re) \ + : \ + "xmm0", "xmm3", "xmm4", "xmm5", \ + "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm6 \n\t" \ + "vbroadcastss %1, %%xmm7 \n\t" \ + "vbroadcastss %2, %%xmm8 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm7, %%ymm7 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((ul).c31.im), \ + "m" ((ul).c32.im), \ + "m" ((ul).c33.im), \ + "m" ((uh).c31.im), \ + "m" ((uh).c32.im), \ + "m" ((uh).c33.im) \ + : \ + "xmm6", "xmm7", "xmm8", \ + "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vmulps %%ymm0, %%ymm3, %%ymm3 \n\t" \ + "vmulps %%ymm0, %%ymm4, %%ymm4 \n\t" \ + "vmulps %%ymm0, %%ymm5, %%ymm5 \n\t" \ + "vmulps %%ymm2, %%ymm6, %%ymm6 \n\t" \ + "vmulps %%ymm2, %%ymm7, %%ymm7 \n\t" \ + "vmulps %%ymm2, %%ymm8, %%ymm8 \n\t" \ + "vaddsubps %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vaddsubps %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vaddsubps %%ymm11, %%ymm5, %%ymm5 \n\t" \ + "vaddsubps %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vaddsubps %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddsubps %%ymm8, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" \ + "vpermilps $0xb1, %%ymm2, %%ymm2 \n\t" \ + "vbroadcastss %0, %%xmm12 \n\t" \ + "vbroadcastss %1, %%xmm13 \n\t" \ + "vbroadcastss %2, %%xmm14 \n\t" \ + "vbroadcastss %3, %%xmm9 \n\t" \ + "vbroadcastss %4, %%xmm10 \n\t" \ + "vbroadcastss %5, %%xmm11 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm9, %%ymm9 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm10, %%ymm10 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm11, %%ymm11" \ + : \ + : \ + "m" ((uh).c21.re), \ + "m" ((uh).c32.re), \ + "m" ((uh).c23.re), \ + "m" ((ul).c21.re), \ + "m" ((ul).c32.re), \ + "m" ((ul).c23.re) \ + : \ + "xmm1", "xmm2", "xmm9", "xmm10", \ + "xmm11", "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm6 \n\t" \ + "vbroadcastss %1, %%xmm7 \n\t" \ + "vbroadcastss %2, %%xmm8 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm7, %%ymm7 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((ul).c31.re), \ + "m" ((ul).c22.re), \ + "m" ((ul).c33.re), \ + "m" ((uh).c31.re), \ + "m" ((uh).c22.re), \ + "m" ((uh).c33.re) \ + : \ + "xmm6", "xmm7", "xmm8", "xmm12", \ + "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vmulps %%ymm1, %%ymm9, %%ymm9 \n\t" \ + "vmulps %%ymm2, %%ymm10, %%ymm10 \n\t" \ + "vmulps %%ymm1, %%ymm11, %%ymm11 \n\t" \ + "vmulps %%ymm2, %%ymm6, %%ymm6 \n\t" \ + "vmulps %%ymm1, %%ymm7, %%ymm7 \n\t" \ + "vmulps %%ymm2, %%ymm8, %%ymm8 \n\t" \ + "vaddps %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vaddps %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vaddps %%ymm11, %%ymm5, %%ymm5 \n\t" \ + "vaddps %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vaddps %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddps %%ymm8, %%ymm5, %%ymm5 \n\t" \ + "vpermilps $0xb1, %%ymm3, %%ymm3 \n\t" \ + "vpermilps $0xb1, %%ymm4, %%ymm4 \n\t" \ + "vpermilps $0xb1, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11") + + +/* +* Multiplies pairs of su3 vectors, stored in the low and high lanes of +* ymm0,..,ymm2, by the su3 matrices ul and uh^dagger, respectively. The +* vectors are assumed to be in vertical order and the products are returned +* in the same order in the registers ymm3,..,ymm5. All registers except +* for ymm15 are changed on exit. +*/ + +#define _avx_su3_pair_mixed_multiply(ul,uh) \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm3 \n\t" \ + "vbroadcastss %1, %%xmm6 \n\t" \ + "vbroadcastss %2, %%xmm4 \n\t" \ + "vbroadcastss %3, %%xmm9 \n\t" \ + "vbroadcastss %4, %%xmm10 \n\t" \ + "vbroadcastss %5, %%xmm11 \n\t" \ + "vinsertf128 $0x1, %%xmm9, %%ymm3, %%ymm3 \n\t" \ + "vinsertf128 $0x1, %%xmm10, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %%xmm11, %%ymm4, %%ymm4" \ + : \ + : \ + "m" ((ul).c11.re), \ + "m" ((ul).c12.re), \ + "m" ((ul).c21.re), \ + "m" ((uh).c11.re), \ + "m" ((uh).c21.re), \ + "m" ((uh).c12.re) \ + : \ + "xmm3", "xmm4", "xmm6", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm7 \n\t" \ + "vbroadcastss %1, %%xmm5 \n\t" \ + "vbroadcastss %2, %%xmm8 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm7, %%ymm7 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm5, %%ymm5 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((ul).c22.re), \ + "m" ((ul).c31.re), \ + "m" ((ul).c32.re), \ + "m" ((uh).c22.re), \ + "m" ((uh).c13.re), \ + "m" ((uh).c23.re) \ + : \ + "xmm5", "xmm7", "xmm8", \ + "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vmulps %%ymm0, %%ymm3, %%ymm3 \n\t" \ + "vmulps %%ymm1, %%ymm6, %%ymm6 \n\t" \ + "vmulps %%ymm0, %%ymm4, %%ymm4 \n\t" \ + "vmulps %%ymm1, %%ymm7, %%ymm7 \n\t" \ + "vmulps %%ymm0, %%ymm5, %%ymm5 \n\t" \ + "vmulps %%ymm1, %%ymm8, %%ymm8 \n\t" \ + "vaddps %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vaddps %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddps %%ymm8, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm9 \n\t" \ + "vbroadcastss %1, %%xmm10 \n\t" \ + "vbroadcastss %2, %%xmm11 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm9, %%ymm9 \n\t" \ + "vperm2f128 $0x1, %%ymm13, %%ymm13, %%ymm13 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm11, %%ymm11 \n\t" \ + "vsubps %%ymm13, %%ymm10, %%ymm10 \n\t" \ + "vpermilps $0xb1, %%ymm0, %%ymm0" \ + : \ + : \ + "m" ((ul).c13.re), \ + "m" ((ul).c21.im), \ + "m" ((ul).c33.re), \ + "m" ((uh).c31.re), \ + "m" ((uh).c12.im), \ + "m" ((uh).c33.re) \ + : \ + "xmm0", "xmm9", "xmm10", "xmm11", \ + "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm6 \n\t" \ + "vbroadcastss %1, %%xmm7 \n\t" \ + "vbroadcastss %2, %%xmm8 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vperm2f128 $0x1, %%ymm12, %%ymm12, %%ymm12 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm7, %%ymm7 \n\t" \ + "vperm2f128 $0x1, %%ymm14, %%ymm14, %%ymm14 \n\t" \ + "vsubps %%ymm12, %%ymm6, %%ymm6 \n\t" \ + "vsubps %%ymm14, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((ul).c11.im), \ + "m" ((ul).c23.re), \ + "m" ((ul).c31.im), \ + "m" ((uh).c11.im), \ + "m" ((uh).c32.re), \ + "m" ((uh).c13.im) \ + : \ + "xmm6", "xmm7", "xmm8", \ + "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vmulps %%ymm2, %%ymm9, %%ymm9 \n\t" \ + "vmulps %%ymm0, %%ymm10, %%ymm10 \n\t" \ + "vmulps %%ymm2, %%ymm11, %%ymm11 \n\t" \ + "vmulps %%ymm0, %%ymm6, %%ymm6 \n\t" \ + "vmulps %%ymm2, %%ymm7, %%ymm7 \n\t" \ + "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" \ + "vaddps %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vaddsubps %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vaddps %%ymm11, %%ymm5, %%ymm5 \n\t" \ + "vaddsubps %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vaddps %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddsubps %%ymm8, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" \ + "vpermilps $0xb1, %%ymm2, %%ymm2 \n\t" \ + "vbroadcastss %0, %%xmm12 \n\t" \ + "vbroadcastss %1, %%xmm13 \n\t" \ + "vbroadcastss %2, %%xmm14 \n\t" \ + "vbroadcastss %3, %%xmm9 \n\t" \ + "vbroadcastss %4, %%xmm10 \n\t" \ + "vbroadcastss %5, %%xmm11 \n\t" \ + "vperm2f128 $0x1, %%ymm12, %%ymm12, %%ymm12 \n\t" \ + "vperm2f128 $0x1, %%ymm13, %%ymm13, %%ymm13 \n\t" \ + "vperm2f128 $0x1, %%ymm14, %%ymm14, %%ymm14 \n\t" \ + "vsubps %%ymm12, %%ymm9, %%ymm9 \n\t" \ + "vsubps %%ymm13, %%ymm10, %%ymm10 \n\t" \ + "vsubps %%ymm14, %%ymm11, %%ymm11" \ + : \ + : \ + "m" ((uh).c21.im), \ + "m" ((uh).c32.im), \ + "m" ((uh).c23.im), \ + "m" ((ul).c12.im), \ + "m" ((ul).c23.im), \ + "m" ((ul).c32.im) \ + : \ + "xmm1", "xmm2", "xmm9", "xmm10", \ + "xmm11", "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm6 \n\t" \ + "vbroadcastss %1, %%xmm7 \n\t" \ + "vbroadcastss %2, %%xmm8 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vperm2f128 $0x1, %%ymm12, %%ymm12, %%ymm12 \n\t" \ + "vperm2f128 $0x1, %%ymm13, %%ymm13, %%ymm13 \n\t" \ + "vperm2f128 $0x1, %%ymm14, %%ymm14, %%ymm14 \n\t" \ + "vsubps %%ymm12, %%ymm6, %%ymm6 \n\t" \ + "vsubps %%ymm13, %%ymm7, %%ymm7 \n\t" \ + "vsubps %%ymm14, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((ul).c13.im), \ + "m" ((ul).c22.im), \ + "m" ((ul).c33.im), \ + "m" ((uh).c31.im), \ + "m" ((uh).c22.im), \ + "m" ((uh).c33.im) \ + : \ + "xmm6", "xmm7", "xmm8", "xmm12", \ + "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vmulps %%ymm1, %%ymm9, %%ymm9 \n\t" \ + "vmulps %%ymm2, %%ymm10, %%ymm10 \n\t" \ + "vmulps %%ymm1, %%ymm11, %%ymm11 \n\t" \ + "vmulps %%ymm2, %%ymm6, %%ymm6 \n\t" \ + "vmulps %%ymm1, %%ymm7, %%ymm7 \n\t" \ + "vmulps %%ymm2, %%ymm8, %%ymm8 \n\t" \ + "vaddsubps %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vaddsubps %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vaddsubps %%ymm11, %%ymm5, %%ymm5 \n\t" \ + "vaddsubps %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vaddsubps %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddsubps %%ymm8, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11") + +/****************************************************************************** +* +* Macros for single precision Dirac spinors in linear order +* +******************************************************************************/ + +/* +* Loads the spinor s to the registers ymm0,..,ymm2 in linear order. +*/ + +#define _avx_spinor_load(s) \ +__asm__ __volatile__ ("vmovaps %0, %%ymm0" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1) \ + : \ + "xmm0"); \ +__asm__ __volatile__ ("vmovaps %0, %%ymm1" \ + : \ + : \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3), \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2) \ + : \ + "xmm1"); \ +__asm__ __volatile__ ("vmovaps %0, %%ymm2" \ + : \ + : \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm2") + +/* +* Loads the spinor s to the registers ymm3,..,ymm5 in linear order. +*/ + +#define _avx_spinor_load_up(s) \ +__asm__ __volatile__ ("vmovaps %0, %%ymm3" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1) \ + : \ + "xmm3"); \ +__asm__ __volatile__ ("vmovaps %0, %%ymm4" \ + : \ + : \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3), \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2) \ + : \ + "xmm4"); \ +__asm__ __volatile__ ("vmovaps %0, %%ymm5" \ + : \ + : \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm5") + +/* +* Stores the registers ymm0,..,ymm2 to the spinor s in linear order. +*/ + +#define _avx_spinor_store(s) \ +__asm__ __volatile__ ("vmovaps %%ymm0, %0 \n\t" \ + : \ + "=m" ((s).c1.c1), \ + "=m" ((s).c1.c2), \ + "=m" ((s).c1.c3), \ + "=m" ((s).c2.c1)); \ +__asm__ __volatile__ ("vmovaps %%ymm1, %0 \n\t" \ + : \ + "=m" ((s).c2.c2), \ + "=m" ((s).c2.c3), \ + "=m" ((s).c3.c1), \ + "=m" ((s).c3.c2)); \ +__asm__ __volatile__ ("vmovaps %%ymm2, %0 \n\t" \ + : \ + "=m" ((s).c3.c3), \ + "=m" ((s).c4.c1), \ + "=m" ((s).c4.c2), \ + "=m" ((s).c4.c3)) + +/* +* Stores the registers ymm3,..,ymm5 to the spinor s in linear order. +*/ + +#define _avx_spinor_store_up(s) \ +__asm__ __volatile__ ("vmovaps %%ymm3, %0 \n\t" \ + : \ + "=m" ((s).c1.c1), \ + "=m" ((s).c1.c2), \ + "=m" ((s).c1.c3), \ + "=m" ((s).c2.c1)); \ +__asm__ __volatile__ ("vmovaps %%ymm4, %0 \n\t" \ + : \ + "=m" ((s).c2.c2), \ + "=m" ((s).c2.c3), \ + "=m" ((s).c3.c1), \ + "=m" ((s).c3.c2)); \ +__asm__ __volatile__ ("vmovaps %%ymm5, %0 \n\t" \ + : \ + "=m" ((s).c3.c3), \ + "=m" ((s).c4.c1), \ + "=m" ((s).c4.c2), \ + "=m" ((s).c4.c3)) + +/* +* Loads (z.re,z.re,..,z.re) to ymm12 and (-z.im,z.im,..,z.im) to ymm13. +*/ + +#define _avx_load_cmplx(z) \ +__asm__ __volatile__ ("vxorps %%ymm13, %%ymm13, %%ymm13 \n\t" \ + "vbroadcastss %0, %%ymm12 \n\t" \ + "vaddsubps %%ymm12, %%ymm13, %%ymm13 \n\t" \ + "vbroadcastss %1, %%ymm12" \ + : \ + : \ + "m" ((z).im), \ + "m" ((z).re) \ + : \ + "xmm12", "xmm13") + +/* +* Loads (z.re,z.re,..,z.re) to ymm14 and (-z.im,z.im,..,z.im) to ymm15 +*/ + +#define _avx_load_cmplx_up(z) \ +__asm__ __volatile__ ("vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" \ + "vbroadcastss %0, %%ymm14 \n\t" \ + "vaddsubps %%ymm14, %%ymm15, %%ymm15 \n\t" \ + "vbroadcastss %1, %%ymm14" \ + : \ + : \ + "m" ((z).im), \ + "m" ((z).re) \ + : \ + "xmm14", "xmm15") + +/* +* Multiplies the spinor s by the complex number z and assigns the result to +* ymm0,..,ymm2, assuming z was loaded using _avx_load_cmplx(z). The registers +* ymm3,..,ymm5 are used as workspace. +*/ + +#define _avx_mulc_spinor(s) \ +_avx_spinor_load(s); \ +__asm__ __volatile__ ("vpermilps $0xb1, %%ymm0, %%ymm3 \n\t" \ + "vpermilps $0xb1, %%ymm1, %%ymm4 \n\t" \ + "vpermilps $0xb1, %%ymm2, %%ymm5 \n\t" \ + "vmulps %%ymm12, %%ymm0, %%ymm0 \n\t" \ + "vmulps %%ymm13, %%ymm3, %%ymm3 \n\t" \ + "vmulps %%ymm12, %%ymm1, %%ymm1 \n\t" \ + "vmulps %%ymm13, %%ymm4, %%ymm4 \n\t" \ + "vmulps %%ymm12, %%ymm2, %%ymm2 \n\t" \ + "vmulps %%ymm13, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5"); \ +__asm__ __volatile__ ("vaddps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddps %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Multiplies the spinor s by the complex number z and adds the result to +* ymm0,..,ymm2, assuming z was loaded using _avx_load_cmplx_up(z). The +* registers ymm3,..,ymm8 are used as workspace. +*/ + +#define _avx_mulc_spinor_add(s) \ +_avx_spinor_load_up(s); \ +__asm__ __volatile__ ("vpermilps $0xb1, %%ymm3, %%ymm6 \n\t" \ + "vpermilps $0xb1, %%ymm4, %%ymm7 \n\t" \ + "vpermilps $0xb1, %%ymm5, %%ymm8 \n\t" \ + "vmulps %%ymm14, %%ymm3, %%ymm3 \n\t" \ + "vmulps %%ymm15, %%ymm6, %%ymm6 \n\t" \ + "vmulps %%ymm14, %%ymm4, %%ymm4 \n\t" \ + "vmulps %%ymm15, %%ymm7, %%ymm7 \n\t" \ + "vmulps %%ymm14, %%ymm5, %%ymm5 \n\t" \ + "vmulps %%ymm15, %%ymm8, %%ymm8 \n\t" \ + "vaddps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddps %%ymm5, %%ymm2, %%ymm2 \n\t" \ + "vaddps %%ymm6, %%ymm0, %%ymm0 \n\t" \ + "vaddps %%ymm7, %%ymm1, %%ymm1 \n\t" \ + "vaddps %%ymm8, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8") + +/* +* Loads (c,c,..,c) to ymm12 and ymm13. +*/ + +#define _avx_load_real(c) \ +__asm__ __volatile__ ("vbroadcastss %0, %%ymm12 \n\t" \ + "vbroadcastss %0, %%ymm13" \ + : \ + : \ + "m" (c) \ + : \ + "xmm12", "xmm13") + +/* +* Loads (c,c,..,c) to ymm14 and ymm15. +*/ + +#define _avx_load_real_up(c) \ +__asm__ __volatile__ ("vbroadcastss %0, %%ymm14 \n\t" \ + "vbroadcastss %0, %%ymm15" \ + : \ + : \ + "m" (c) \ + : \ + "xmm14", "xmm15") + +/* +* Multiplies the spinor s by the real number c and assigns the result to +* ymm0,..,ymm2, assuming c was loaded using _avx_load_real(c). +*/ + +#define _avx_mulr_spinor(s) \ +_avx_spinor_load(s); \ +__asm__ __volatile__ ("vmulps %%ymm12, %%ymm0, %%ymm0 \n\t" \ + "vmulps %%ymm13, %%ymm1, %%ymm1 \n\t" \ + "vmulps %%ymm12, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Multiplies the spinor s by the real number c and adds the result to +* ymm0,..,ymm2, assuming c was loaded using _avx_load_real_up(c). The +* registers ymm3,..,ymm5 are used as workspace. +*/ + +#define _avx_mulr_spinor_add(s) \ +_avx_spinor_load_up(s); \ +__asm__ __volatile__ ("vmulps %%ymm14, %%ymm3, %%ymm3 \n\t" \ + "vmulps %%ymm15, %%ymm4, %%ymm4 \n\t" \ + "vmulps %%ymm14, %%ymm5, %%ymm5 \n\t" \ + "vaddps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddps %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/******************************************************************************* +* +* Macros operating on double precision data +* +*******************************************************************************/ + +/******************************************************************************* +* +* Macros for su3_vector data +* +* Most of these macros operate on pairs of su3 vectors that are stored +* in the low and high lanes of ymm0,..,ymm2 or ymm3,..,ymm5. For example, +* +* ymm0 <- sl.c1.re,sl.c1.im,sh.c1.re,sh.c1.im +* ymm1 <- sl.c2.re,sl.c2.im,sh.c2.re,sh.c2.im +* ymm2 <- sl.c3.re,sl.c3.im,sh.c3.re,sh.c3.im +* +* (where sl and sh are of type su3_vector). +* +*******************************************************************************/ + +/* +* Loads two su3 vectors sl and sh to the low and high lanes of ymm0,..,ymm2. +*/ + +#define _avx_pair_load_dble(sl,sh) \ +__asm__ __volatile__ ("vmovapd %0, %%xmm0 \n\t" \ + "vmovapd %1, %%xmm1 \n\t" \ + "vmovapd %2, %%xmm2 \n\t" \ + "vinsertf128 $0x1, %3, %%ymm0, %%ymm0 \n\t" \ + "vinsertf128 $0x1, %4, %%ymm1, %%ymm1 \n\t" \ + "vinsertf128 $0x1, %5, %%ymm2, %%ymm2" \ + : \ + : \ + "m" ((sl).c1), \ + "m" ((sl).c2), \ + "m" ((sl).c3), \ + "m" ((sh).c1), \ + "m" ((sh).c2), \ + "m" ((sh).c3) \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Loads two su3 vectors sl and sh to the low and high lanes of ymm3,..,ymm5. +*/ + +#define _avx_pair_load_up_dble(sl,sh) \ +__asm__ __volatile__ ("vmovapd %0, %%xmm3 \n\t" \ + "vmovapd %1, %%xmm4 \n\t" \ + "vmovapd %2, %%xmm5 \n\t" \ + "vinsertf128 $0x1, %3, %%ymm3, %%ymm3 \n\t" \ + "vinsertf128 $0x1, %4, %%ymm4, %%ymm4 \n\t" \ + "vinsertf128 $0x1, %5, %%ymm5, %%ymm5" \ + : \ + : \ + "m" ((sl).c1), \ + "m" ((sl).c2), \ + "m" ((sl).c3), \ + "m" ((sh).c1), \ + "m" ((sh).c2), \ + "m" ((sh).c3) \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Stores the low and high lanes of ymm0,..,ymm2 to the su3 vectors rl and rh. +*/ + +#define _avx_pair_store_dble(rl,rh) \ +__asm__ __volatile__ ("vmovapd %%xmm0, %0 \n\t" \ + "vmovapd %%xmm1, %1 \n\t" \ + "vmovapd %%xmm2, %2 \n\t" \ + "vextractf128 $0x1, %%ymm0, %3 \n\t" \ + "vextractf128 $0x1, %%ymm1, %4 \n\t" \ + "vextractf128 $0x1, %%ymm2, %5" \ + : \ + "=m" ((rl).c1), \ + "=m" ((rl).c2), \ + "=m" ((rl).c3), \ + "=m" ((rh).c1), \ + "=m" ((rh).c2), \ + "=m" ((rh).c3)) + +/* +* Stores the low and high lanes of ymm3,..,ymm5 to the su3 vectors rl and rh. +*/ + +#define _avx_pair_store_up_dble(rl,rh) \ +__asm__ __volatile__ ("vmovapd %%xmm3, %0 \n\t" \ + "vmovapd %%xmm4, %1 \n\t" \ + "vmovapd %%xmm5, %2 \n\t" \ + "vextractf128 $0x1, %%ymm3, %3 \n\t" \ + "vextractf128 $0x1, %%ymm4, %4 \n\t" \ + "vextractf128 $0x1, %%ymm5, %5" \ + : \ + "=m" ((rl).c1), \ + "=m" ((rl).c2), \ + "=m" ((rl).c3), \ + "=m" ((rh).c1), \ + "=m" ((rh).c2), \ + "=m" ((rh).c3)) + +/* +* Loads the components of a Weyl spinor s to ymm0,..,ymm2 in linear order. +*/ + +#define _avx_weyl_load_dble(s) \ +__asm__ __volatile__ ("vmovapd %0, %%ymm0 \n\t" \ + "vmovapd %2, %%ymm1 \n\t" \ + "vmovapd %4, %%ymm2" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Loads the components of a Weyl spinor s to ymm3,..,ymm5 in linear order. +*/ + +#define _avx_weyl_load_up_dble(s) \ +__asm__ __volatile__ ("vmovapd %0, %%ymm3 \n\t" \ + "vmovapd %2, %%ymm4 \n\t" \ + "vmovapd %4, %%ymm5" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Stores ymm0,..,ymm2 to the components of a Weyl spinor s in linear order. +*/ + +#define _avx_weyl_store_dble(s) \ +__asm__ __volatile__ ("vmovapd %%ymm0, %0 \n\t" \ + "vmovapd %%ymm1, %2 \n\t" \ + "vmovapd %%ymm2, %4" \ + : \ + "=m" ((s).c1.c1), \ + "=m" ((s).c1.c2), \ + "=m" ((s).c1.c3), \ + "=m" ((s).c2.c1), \ + "=m" ((s).c2.c2), \ + "=m" ((s).c2.c3)) + +/* +* Stores ymm3,..,ymm5 to the components of a Weyl spinor s in linear order. +*/ + +#define _avx_weyl_store_up_dble(s) \ +__asm__ __volatile__ ("vmovapd %%ymm3, %0 \n\t" \ + "vmovapd %%ymm4, %2 \n\t" \ + "vmovapd %%ymm5, %4" \ + : \ + "=m" ((s).c1.c1), \ + "=m" ((s).c1.c2), \ + "=m" ((s).c1.c3), \ + "=m" ((s).c2.c1), \ + "=m" ((s).c2.c2), \ + "=m" ((s).c2.c3)) + +/* +* Adds ymm3,..,ymm5 to ymm0,..,ymm2. +*/ + +#define _avx_vector_add_dble() \ +__asm__ __volatile__ ("vaddpd %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddpd %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddpd %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Subtracts ymm3,..,ymm5 from ymm0,..,ymm2. +*/ + +#define _avx_vector_sub_dble() \ +__asm__ __volatile__ ("vsubpd %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vsubpd %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vsubpd %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Multiplies the high lanes of ymm3,..,ymm5 by -1 and adds these registers +* to ymm0,..,ymm2. +*/ + +#define _avx_vector_addsub_dble() \ +__asm__ __volatile__ ("vmulpd %0, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %0, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %0, %%ymm5, %%ymm5 \n\t" \ + "vaddpd %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddpd %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddpd %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + "m" (_avx_sgn34_dble) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies the low lanes of ymm3,..,ymm5 by -1 and adds these registers +* to ymm0,..,ymm2. +*/ + +#define _avx_vector_subadd_dble() \ +__asm__ __volatile__ ("vmulpd %0, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %0, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %0, %%ymm5, %%ymm5 \n\t" \ + "vaddpd %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddpd %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddpd %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + "m" (_avx_sgn12_dble) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies the registers ymm3,..,ymm5 by i and adds them to ymm0,..,ymm2. +*/ + +#define _avx_vector_i_add_dble() \ +__asm__ __volatile__ ("vpermilpd $0x5, %%ymm3, %%ymm3 \n\t" \ + "vpermilpd $0x5, %%ymm4, %%ymm4 \n\t" \ + "vpermilpd $0x5, %%ymm5, %%ymm5 \n\t" \ + "vaddsubpd %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddsubpd %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddsubpd %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies the registers ymm3,..,ymm5 by i and subtracts them from +* ymm0,..,ymm2. +*/ + +#define _avx_vector_i_sub_dble() \ +__asm__ __volatile__ ("vpermilpd $0x5, %%ymm3, %%ymm3 \n\t" \ + "vpermilpd $0x5, %%ymm4, %%ymm4 \n\t" \ + "vpermilpd $0x5, %%ymm5, %%ymm5 \n\t" \ + "vmulpd %0, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %0, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %0, %%ymm5, %%ymm5 \n\t" \ + "vaddpd %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddpd %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddpd %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + "m" (_avx_sgn24_dble) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Exchanges the high and low lanes of ymm3,..,ymm5, multiplies them by i +* and adds the result to ymm0,..,ymm2. +*/ + +#define _avx_vector_xch_i_add_dble() \ +__asm__ __volatile__ ("vpermilpd $0x5, %%ymm3, %%ymm3 \n\t" \ + "vpermilpd $0x5, %%ymm4, %%ymm4 \n\t" \ + "vpermilpd $0x5, %%ymm5, %%ymm5 \n\t" \ + "vperm2f128 $0x1, %%ymm3, %%ymm3, %%ymm3 \n\t" \ + "vperm2f128 $0x1, %%ymm4, %%ymm4, %%ymm4 \n\t" \ + "vperm2f128 $0x1, %%ymm5, %%ymm5, %%ymm5 \n\t" \ + "vaddsubpd %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddsubpd %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddsubpd %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Exchanges the high and low lanes of ymm3,..,ymm5, multiplies them by i +* and subtracts the result from ymm0,..,ymm2. +*/ + +#define _avx_vector_xch_i_sub_dble() \ +__asm__ __volatile__ ("vpermilpd $0x5, %%ymm3, %%ymm3 \n\t" \ + "vpermilpd $0x5, %%ymm4, %%ymm4 \n\t" \ + "vpermilpd $0x5, %%ymm5, %%ymm5 \n\t" \ + "vperm2f128 $0x1, %%ymm3, %%ymm3, %%ymm3 \n\t" \ + "vperm2f128 $0x1, %%ymm4, %%ymm4, %%ymm4 \n\t" \ + "vperm2f128 $0x1, %%ymm5, %%ymm5, %%ymm5 \n\t" \ + "vmulpd %0, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %0, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %0, %%ymm5, %%ymm5 \n\t" \ + "vaddpd %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddpd %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddpd %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + "m" (_avx_sgn24_dble) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies the low and high lanes of ymm3,..,ymm5 by i and -i +* respectively and adds these registers to ymm0,..,ymm2. +*/ + +#define _avx_vector_i_addsub_dble() \ +__asm__ __volatile__ ("vpermilpd $0x5, %%ymm3, %%ymm3 \n\t" \ + "vpermilpd $0x5, %%ymm4, %%ymm4 \n\t" \ + "vpermilpd $0x5, %%ymm5, %%ymm5 \n\t" \ + "vmulpd %0, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %0, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %0, %%ymm5, %%ymm5 \n\t" \ + "vaddpd %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddpd %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddpd %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + "m" (_avx_sgn14_dble) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies the low and high words of ymm3,..,ymm5 by -i and i +* respectively and adds these registers to ymm0,..,ymm2. +*/ + +#define _avx_vector_i_subadd_dble() \ +__asm__ __volatile__ ("vpermilpd $0x5, %%ymm3, %%ymm3 \n\t" \ + "vpermilpd $0x5, %%ymm4, %%ymm4 \n\t" \ + "vpermilpd $0x5, %%ymm5, %%ymm5 \n\t" \ + "vmulpd %0, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %0, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %0, %%ymm5, %%ymm5 \n\t" \ + "vaddpd %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddpd %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddpd %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + "m" (_avx_sgn23_dble) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Exchanges the high and low lanes of ymm3,..,ymm5. +*/ + +#define _avx_vector_xch_dble() \ +__asm__ __volatile__ ("vperm2f128 $0x1, %%ymm3, %%ymm3, %%ymm3 \n\t" \ + "vperm2f128 $0x1, %%ymm4, %%ymm4, %%ymm4 \n\t" \ + "vperm2f128 $0x1, %%ymm5, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + +/****************************************************************************** +* +* Action of su3 matrices on su3 vectors +* +******************************************************************************/ + +/* +* Multiplies a pair sl,sh of su3 vectors by an su3 matrix u, assuming sl and +* sh are in the low and high lanes of ymm0,..,ymm2. On output the result is +* in ymm3,..,ymm5 and all registers except for ymm12,..,ymm15 are changed. +*/ + +#define _avx_su3_multiply_pair_dble(u) \ +__asm__ __volatile__ ("vbroadcastsd %0, %%ymm3 \n\t" \ + "vbroadcastsd %1, %%ymm6 \n\t" \ + "vbroadcastsd %2, %%ymm4 \n\t" \ + "vbroadcastsd %3, %%ymm7 \n\t" \ + "vbroadcastsd %4, %%ymm5 \n\t" \ + "vbroadcastsd %5, %%ymm8" \ + : \ + : \ + "m" ((u).c11.re), \ + "m" ((u).c12.re), \ + "m" ((u).c21.re), \ + "m" ((u).c22.re), \ + "m" ((u).c31.re), \ + "m" ((u).c32.re) \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vmulpd %%ymm0, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %%ymm1, %%ymm6, %%ymm6 \n\t" \ + "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %%ymm1, %%ymm7, %%ymm7 \n\t" \ + "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" \ + "vmulpd %%ymm1, %%ymm8, %%ymm8 \n\t" \ + "vaddpd %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vaddpd %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddpd %%ymm8, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vbroadcastsd %0, %%ymm9 \n\t" \ + "vbroadcastsd %1, %%ymm10 \n\t" \ + "vbroadcastsd %2, %%ymm11 \n\t" \ + "vbroadcastsd %3, %%ymm6 \n\t" \ + "vbroadcastsd %4, %%ymm7 \n\t" \ + "vbroadcastsd %5, %%ymm8 \n\t" \ + "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" \ + : \ + : \ + "m" ((u).c13.re), \ + "m" ((u).c21.im), \ + "m" ((u).c33.re), \ + "m" ((u).c11.im), \ + "m" ((u).c23.re), \ + "m" ((u).c31.im) \ + : \ + "xmm0", "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vmulpd %%ymm2, %%ymm9, %%ymm9 \n\t" \ + "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t" \ + "vmulpd %%ymm2, %%ymm11, %%ymm11 \n\t" \ + "vmulpd %%ymm0, %%ymm6, %%ymm6 \n\t" \ + "vmulpd %%ymm2, %%ymm7, %%ymm7 \n\t" \ + "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" \ + "vaddpd %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vaddsubpd %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vaddpd %%ymm11, %%ymm5, %%ymm5 \n\t" \ + "vaddsubpd %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vaddpd %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddsubpd %%ymm8, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vbroadcastsd %0, %%ymm9 \n\t" \ + "vbroadcastsd %1, %%ymm10 \n\t" \ + "vbroadcastsd %2, %%ymm11 \n\t" \ + "vbroadcastsd %3, %%ymm6 \n\t" \ + "vbroadcastsd %4, %%ymm7 \n\t" \ + "vbroadcastsd %5, %%ymm8 \n\t" \ + "vpermilpd $0x5, %%ymm1, %%ymm1 \n\t" \ + "vpermilpd $0x5, %%ymm2, %%ymm2" \ + : \ + : \ + "m" ((u).c12.im), \ + "m" ((u).c23.im), \ + "m" ((u).c32.im), \ + "m" ((u).c13.im), \ + "m" ((u).c22.im), \ + "m" ((u).c33.im) \ + : \ + "xmm1", "xmm2", "xmm6", "xmm7", \ + "xmm8", "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vmulpd %%ymm1, %%ymm9, %%ymm9 \n\t" \ + "vmulpd %%ymm2, %%ymm10, %%ymm10 \n\t" \ + "vmulpd %%ymm1, %%ymm11, %%ymm11 \n\t" \ + "vmulpd %%ymm2, %%ymm6, %%ymm6 \n\t" \ + "vmulpd %%ymm1, %%ymm7, %%ymm7 \n\t" \ + "vmulpd %%ymm2, %%ymm8, %%ymm8 \n\t" \ + "vaddsubpd %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vaddsubpd %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vaddsubpd %%ymm11, %%ymm5, %%ymm5 \n\t" \ + "vaddsubpd %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vaddsubpd %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddsubpd %%ymm8, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11") + +/* +* Multiplies a pair sl,sh of su3 vectors by an su3 matrix u^dagger, assuming +* sl and sh are in the low and high lanes of ymm0,..,ymm2. On output the +* result is in ymm3,..,ymm5 and all registers are changed. +*/ + +#define _avx_su3_inverse_multiply_pair_dble(u) \ +__asm__ __volatile__ ("vbroadcastsd %0, %%ymm3 \n\t" \ + "vbroadcastsd %1, %%ymm6 \n\t" \ + "vbroadcastsd %2, %%ymm4 \n\t" \ + "vbroadcastsd %3, %%ymm7 \n\t" \ + "vbroadcastsd %4, %%ymm5 \n\t" \ + "vbroadcastsd %5, %%ymm8 \n\t" \ + "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" \ + "vmulpd %%ymm0, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %%ymm1, %%ymm6, %%ymm6 \n\t" \ + "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %%ymm1, %%ymm7, %%ymm7 \n\t" \ + "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" \ + "vmulpd %%ymm1, %%ymm8, %%ymm8 \n\t" \ + "vaddpd %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vsubpd %%ymm0, %%ymm15, %%ymm0 \n\t" \ + "vaddpd %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddpd %%ymm8, %%ymm5, %%ymm5 \n\t" \ + "vpermilpd $0x5, %%ymm0, %%ymm0" \ + : \ + : \ + "m" ((u).c11.re), \ + "m" ((u).c21.re), \ + "m" ((u).c12.re), \ + "m" ((u).c22.re), \ + "m" ((u).c13.re), \ + "m" ((u).c23.re) \ + : \ + "xmm0", "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", "xmm15"); \ +__asm__ __volatile__ ("vbroadcastsd %0, %%ymm9 \n\t" \ + "vbroadcastsd %1, %%ymm10 \n\t" \ + "vbroadcastsd %2, %%ymm11 \n\t" \ + "vbroadcastsd %3, %%ymm12 \n\t" \ + "vbroadcastsd %4, %%ymm13 \n\t" \ + "vbroadcastsd %5, %%ymm14 \n\t" \ + "vsubpd %%ymm1, %%ymm15, %%ymm1 \n\t" \ + "vmulpd %%ymm2, %%ymm9, %%ymm9 \n\t" \ + "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t" \ + "vmulpd %%ymm2, %%ymm11, %%ymm11 \n\t" \ + "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" \ + "vaddpd %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %%ymm2, %%ymm13, %%ymm13 \n\t" \ + "vpermilpd $0x5, %%ymm1, %%ymm1 \n\t" \ + "vaddsubpd %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vsubpd %%ymm2, %%ymm15, %%ymm2 \n\t" \ + "vaddpd %%ymm11, %%ymm5, %%ymm5 \n\t" \ + "vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t" \ + "vpermilpd $0x5, %%ymm2, %%ymm2" \ + : \ + : \ + "m" ((u).c31.re), \ + "m" ((u).c12.im), \ + "m" ((u).c33.re), \ + "m" ((u).c11.im), \ + "m" ((u).c32.re), \ + "m" ((u).c13.im) \ + : \ + "xmm1", "xmm2", "xmm3", "xmm4", \ + "xmm5", "xmm9", "xmm10", "xmm11", \ + "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vbroadcastsd %0, %%ymm6 \n\t" \ + "vbroadcastsd %1, %%ymm7 \n\t" \ + "vbroadcastsd %2, %%ymm8 \n\t" \ + "vbroadcastsd %3, %%ymm9 \n\t" \ + "vbroadcastsd %4, %%ymm10 \n\t" \ + "vbroadcastsd %5, %%ymm11 \n\t" \ + "vmulpd %%ymm1, %%ymm6, %%ymm6 \n\t" \ + "vaddsubpd %%ymm12, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %%ymm2, %%ymm7, %%ymm7 \n\t" \ + "vaddpd %%ymm13, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %%ymm1, %%ymm8, %%ymm8 \n\t" \ + "vaddsubpd %%ymm14, %%ymm5, %%ymm5 \n\t" \ + "vmulpd %%ymm2, %%ymm9, %%ymm9 \n\t" \ + "vaddsubpd %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %%ymm1, %%ymm10, %%ymm10 \n\t" \ + "vaddsubpd %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %%ymm2, %%ymm11, %%ymm11 \n\t" \ + "vaddsubpd %%ymm8, %%ymm5, %%ymm5" \ + : \ + : \ + "m" ((u).c21.im), \ + "m" ((u).c32.im), \ + "m" ((u).c23.im), \ + "m" ((u).c31.im), \ + "m" ((u).c22.im), \ + "m" ((u).c33.im) \ + : \ + "xmm3", "xmm4", "xmm5", "xmm6", \ + "xmm7", "xmm8", "xmm9", "xmm10", \ + "xmm11"); \ +__asm__ __volatile__ ("vaddsubpd %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vaddsubpd %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vaddsubpd %%ymm11, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + +/****************************************************************************** +* +* Macros for double precision Dirac spinors in linear order. +* +******************************************************************************/ + +/* +* Loads the spinor s to the registers ymm0,..,ymm5 in linear order. +*/ + +#define _avx_spinor_load_dble(s) \ +__asm__ __volatile__ ("vmovapd %0, %%ymm0 \n\t" \ + "vmovapd %2, %%ymm1 \n\t" \ + "vmovapd %4, %%ymm2" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm0", "xmm1", "xmm2"); \ +__asm__ __volatile__ ("vmovapd %0, %%ymm3 \n\t" \ + "vmovapd %2, %%ymm4 \n\t" \ + "vmovapd %4, %%ymm5" \ + : \ + : \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2), \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Loads the spinor s to the registers ymm6,..,ymm11 in linear order. +*/ + +#define _avx_spinor_load_up_dble(s) \ +__asm__ __volatile__ ("vmovapd %0, %%ymm6 \n\t" \ + "vmovapd %2, %%ymm7 \n\t" \ + "vmovapd %4, %%ymm8" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vmovapd %0, %%ymm9 \n\t" \ + "vmovapd %2, %%ymm10 \n\t" \ + "vmovapd %4, %%ymm11" \ + : \ + : \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2), \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm9", "xmm10", "xmm11") + +/* +* Stores the registers ymm0,..,ymm5 to the spinor s in linear order. +*/ + +#define _avx_spinor_store_dble(s) \ +__asm__ __volatile__ ("vmovapd %%ymm0, %0 \n\t" \ + "vmovapd %%ymm1, %2 \n\t" \ + "vmovapd %%ymm2, %4" \ + : \ + "=m" ((s).c1.c1), \ + "=m" ((s).c1.c2), \ + "=m" ((s).c1.c3), \ + "=m" ((s).c2.c1), \ + "=m" ((s).c2.c2), \ + "=m" ((s).c2.c3)); \ +__asm__ __volatile__ ("vmovapd %%ymm3, %0 \n\t" \ + "vmovapd %%ymm4, %2 \n\t" \ + "vmovapd %%ymm5, %4" \ + : \ + "=m" ((s).c3.c1), \ + "=m" ((s).c3.c2), \ + "=m" ((s).c3.c3), \ + "=m" ((s).c4.c1), \ + "=m" ((s).c4.c2), \ + "=m" ((s).c4.c3)) + +/* +* Stores the registers ymm6,..,ymm11 to the spinor s in linear order. +*/ + +#define _avx_spinor_store_up_dble(s) \ +__asm__ __volatile__ ("vmovapd %%ymm6, %0 \n\t" \ + "vmovapd %%ymm7, %2 \n\t" \ + "vmovapd %%ymm8, %4" \ + : \ + "=m" ((s).c1.c1), \ + "=m" ((s).c1.c2), \ + "=m" ((s).c1.c3), \ + "=m" ((s).c2.c1), \ + "=m" ((s).c2.c2), \ + "=m" ((s).c2.c3)); \ +__asm__ __volatile__ ("vmovapd %%ymm9, %0 \n\t" \ + "vmovapd %%ymm10, %2 \n\t" \ + "vmovapd %%ymm11, %4" \ + : \ + "=m" ((s).c3.c1), \ + "=m" ((s).c3.c2), \ + "=m" ((s).c3.c3), \ + "=m" ((s).c4.c1), \ + "=m" ((s).c4.c2), \ + "=m" ((s).c4.c3)) + +/* +* Loads (z.re,z.re,z.re,z.re) to ymm12 and (-z.im,z.im,-z.im,z.im) to ymm13. +*/ + +#define _avx_load_cmplx_dble(z) \ +__asm__ __volatile__ ("vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" \ + "vbroadcastsd %0, %%ymm12 \n\t" \ + "vaddsubpd %%ymm12, %%ymm13, %%ymm13 \n\t" \ + "vbroadcastsd %1, %%ymm12" \ + : \ + : \ + "m" ((z).im), \ + "m" ((z).re) \ + : \ + "xmm12", "xmm13") + +/* +* Loads (z.re,z.re,z.re,z.re) to ymm14 and (-z.im,z.im,-z.im,z.im) to ymm15. +*/ + +#define _avx_load_cmplx_up_dble(z) \ +__asm__ __volatile__ ("vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" \ + "vbroadcastsd %0, %%ymm14 \n\t" \ + "vaddsubpd %%ymm14, %%ymm15, %%ymm15 \n\t" \ + "vbroadcastsd %1, %%ymm14" \ + : \ + : \ + "m" ((z).im), \ + "m" ((z).re) \ + : \ + "xmm14", "xmm15") + +/* +* Multiplies the spinor s by the complex number z and assigns the result to +* ymm0,..,ymm5, assuming z was loaded using _avx_load_cmplx_dble(z). The +* registers ymm6,..,ymm11 are used as workspace. +*/ + +#define _avx_mulc_spinor_dble(s) \ +_avx_spinor_load_dble(s); \ +__asm__ __volatile__ ("vpermilpd $0x5, %%ymm0, %%ymm6 \n\t" \ + "vpermilpd $0x5, %%ymm1, %%ymm7 \n\t" \ + "vpermilpd $0x5, %%ymm2, %%ymm8 \n\t" \ + "vpermilpd $0x5, %%ymm3, %%ymm9 \n\t" \ + "vpermilpd $0x5, %%ymm4, %%ymm10 \n\t" \ + "vpermilpd $0x5, %%ymm5, %%ymm11 \n\t" \ + "vmulpd %%ymm12, %%ymm0, %%ymm0 \n\t" \ + "vmulpd %%ymm13, %%ymm6, %%ymm6 \n\t" \ + "vmulpd %%ymm12, %%ymm1, %%ymm1 \n\t" \ + "vmulpd %%ymm13, %%ymm7, %%ymm7 \n\t" \ + "vmulpd %%ymm12, %%ymm2, %%ymm2 \n\t" \ + "vmulpd %%ymm13, %%ymm8, %%ymm8" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vmulpd %%ymm12, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %%ymm13, %%ymm9, %%ymm9 \n\t" \ + "vmulpd %%ymm12, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %%ymm13, %%ymm10, %%ymm10 \n\t" \ + "vmulpd %%ymm12, %%ymm5, %%ymm5 \n\t" \ + "vmulpd %%ymm13, %%ymm11, %%ymm11 \n\t" \ + "vaddpd %%ymm6, %%ymm0, %%ymm0 \n\t" \ + "vaddpd %%ymm7, %%ymm1, %%ymm1 \n\t" \ + "vaddpd %%ymm8, %%ymm2, %%ymm2 \n\t" \ + "vaddpd %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vaddpd %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vaddpd %%ymm11, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5", \ + "xmm9", "xmm10", "xmm11") + +/* +* Multiplies the spinor s by the complex number z and adds the result to +* ymm0,..,ymm5, assuming z was loaded using _avx_load_cmplx_up_dble(z). The +* registers ymm6,..,ymm11 are used as workspace. +*/ + +#define _avx_mulc_spinor_add_dble(s) \ +__asm__ __volatile__ ("vmovapd %0, %%ymm6 \n\t" \ + "vmovapd %2, %%ymm7 \n\t" \ + "vmovapd %4, %%ymm8" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vpermilpd $0x5, %%ymm6, %%ymm9 \n\t" \ + "vpermilpd $0x5, %%ymm7, %%ymm10 \n\t" \ + "vpermilpd $0x5, %%ymm8, %%ymm11 \n\t" \ + "vmulpd %%ymm14, %%ymm6, %%ymm6 \n\t" \ + "vmulpd %%ymm15, %%ymm9, %%ymm9 \n\t" \ + "vmulpd %%ymm14, %%ymm7, %%ymm7 \n\t" \ + "vmulpd %%ymm15, %%ymm10, %%ymm10 \n\t" \ + "vmulpd %%ymm14, %%ymm8, %%ymm8 \n\t" \ + "vmulpd %%ymm15, %%ymm11, %%ymm11 \n\t" \ + "vaddpd %%ymm6, %%ymm0, %%ymm0 \n\t" \ + "vaddpd %%ymm7, %%ymm1, %%ymm1 \n\t" \ + "vaddpd %%ymm8, %%ymm2, %%ymm2 \n\t" \ + "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" \ + "vaddpd %%ymm10, %%ymm1, %%ymm1 \n\t" \ + "vaddpd %%ymm11, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vmovapd %0, %%ymm6 \n\t" \ + "vmovapd %2, %%ymm7 \n\t" \ + "vmovapd %4, %%ymm8" \ + : \ + : \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2), \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vpermilpd $0x5, %%ymm6, %%ymm9 \n\t" \ + "vpermilpd $0x5, %%ymm7, %%ymm10 \n\t" \ + "vpermilpd $0x5, %%ymm8, %%ymm11 \n\t" \ + "vmulpd %%ymm14, %%ymm6, %%ymm6 \n\t" \ + "vmulpd %%ymm15, %%ymm9, %%ymm9 \n\t" \ + "vmulpd %%ymm14, %%ymm7, %%ymm7 \n\t" \ + "vmulpd %%ymm15, %%ymm10, %%ymm10 \n\t" \ + "vmulpd %%ymm14, %%ymm8, %%ymm8 \n\t" \ + "vmulpd %%ymm15, %%ymm11, %%ymm11 \n\t" \ + "vaddpd %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vaddpd %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddpd %%ymm8, %%ymm5, %%ymm5 \n\t" \ + "vaddpd %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vaddpd %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vaddpd %%ymm11, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11") + +/* +* Loads (c,c,c,c) to ymm12 and ymm13. +*/ + +#define _avx_load_real_dble(c) \ +__asm__ __volatile__ ("vbroadcastsd %0, %%ymm12 \n\t" \ + "vbroadcastsd %0, %%ymm13" \ + : \ + : \ + "m" (c) \ + : \ + "xmm12", "xmm13") + +/* +* Loads (c,c,c,c) to ymm14 and ymm15. +*/ + +#define _avx_load_real_up_dble(c) \ +__asm__ __volatile__ ("vbroadcastsd %0, %%ymm14 \n\t" \ + "vbroadcastsd %0, %%ymm15" \ + : \ + : \ + "m" (c) \ + : \ + "xmm14", "xmm15") + +/* +* Multiplies the spinor s by the real number c and assigns the result to +* ymm0,..,ymm5, assuming c was loaded using _avx_load_real_dble(c). +*/ + +#define _avx_mulr_spinor_dble(s) \ +_avx_spinor_load_dble(s); \ +__asm__ __volatile__ ("vmulpd %%ymm12, %%ymm0, %%ymm0 \n\t" \ + "vmulpd %%ymm13, %%ymm1, %%ymm1 \n\t" \ + "vmulpd %%ymm12, %%ymm2, %%ymm2 \n\t" \ + "vmulpd %%ymm13, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %%ymm12, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %%ymm13, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm0", "xmm1", "xmm2") + + +/* +* Multiplies the spinor s by the real number c and adds the result to +* ymm0,..,ymm5, assuming c was loaded using _avx_load_real_up_dble(c). +* The registers ymm6,..,ymm11 are used as workspace. +*/ + +#define _avx_mulr_spinor_add_dble(s) \ +_avx_spinor_load_up_dble(s); \ +__asm__ __volatile__ ("vmulpd %%ymm14, %%ymm6, %%ymm6 \n\t" \ + "vmulpd %%ymm15, %%ymm7, %%ymm7 \n\t" \ + "vmulpd %%ymm14, %%ymm8, %%ymm8 \n\t" \ + "vmulpd %%ymm15, %%ymm9, %%ymm9 \n\t" \ + "vmulpd %%ymm14, %%ymm10, %%ymm10 \n\t" \ + "vmulpd %%ymm15, %%ymm11, %%ymm11 \n\t" \ + "vaddpd %%ymm6, %%ymm0, %%ymm0 \n\t" \ + "vaddpd %%ymm7, %%ymm1, %%ymm1 \n\t" \ + "vaddpd %%ymm8, %%ymm2, %%ymm2 \n\t" \ + "vaddpd %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vaddpd %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vaddpd %%ymm11, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11") + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/block.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/block.h new file mode 100644 index 0000000000000000000000000000000000000000..b8f9fd4bcf7cce8b5b791dff762268338028855c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/block.h @@ -0,0 +1,83 @@ + +/******************************************************************************* +* +* File block.h +* +* Copyright (C) 2005, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef BLOCK_H +#define BLOCK_H + +#ifndef SU3_H +#include "su3.h" +#endif + +#ifndef UTILS_H +#include "utils.h" +#endif + +typedef struct +{ + int ifc,ibn,vol,nw,nwd; + int *ipp,*map,*imb; + su3 *u; + su3_dble *ud; + weyl **w; + weyl_dble **wd; +} bndry_t; + +typedef struct +{ + int *bo,*bs,vol,vbb,nbp,ns,nsd,shf; + int *ipt,*imb,*ibp; + int (*iup)[4],(*idn)[4]; + su3 *u; + su3_dble *ud; + pauli *sw; + pauli_dble *swd; + spinor **s; + spinor_dble **sd; + bndry_t *bb; +} block_t; + +typedef enum +{ + SAP_BLOCKS,DFL_BLOCKS, + BLK_GRIDS +} blk_grid_t; + +/* BLOCK_C */ +extern void alloc_blk(block_t *b,int *bo,int *bs, + int iu,int iud,int ns,int nsd); +extern void alloc_bnd(block_t *b,int iu,int iud,int nw,int nwd); +extern void clone_blk(block_t *b,int shf,int *bo,block_t *c); +extern void free_blk(block_t *b); +extern int ipt_blk(block_t *b,int *x); + +/* BLK_GRID_C */ +extern void alloc_bgr(blk_grid_t grid); +extern block_t *blk_list(blk_grid_t grid,int *nb,int *isw); + +/* MAP_U2BLK_C */ +extern void assign_ud2ubgr(blk_grid_t grid); +extern void assign_ud2udblk(blk_grid_t grid,int n); + +/* MAP_SW2BLK_C */ +extern int assign_swd2swbgr(blk_grid_t grid,ptset_t set); +extern int assign_swd2swdblk(blk_grid_t grid,int n,ptset_t set); + +/* MAP_S2BLK_C */ +extern void assign_s2sblk(blk_grid_t grid,int n,ptset_t set,spinor *s,int k); +extern void assign_sblk2s(blk_grid_t grid,int n,ptset_t set,int k,spinor *s); +extern void assign_s2sdblk(blk_grid_t grid,int n,ptset_t set,spinor *s,int k); +extern void assign_sd2sdblk(blk_grid_t grid,int n,ptset_t set, + spinor_dble *sd,int k); +extern void assign_sdblk2sd(blk_grid_t grid,int n,ptset_t set, + int k,spinor_dble *sd); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/dfl.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/dfl.h new file mode 100644 index 0000000000000000000000000000000000000000..352338dd7bb93d063afbb11af1b2935765b747c6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/dfl.h @@ -0,0 +1,57 @@ + +/******************************************************************************* +* +* File dfl.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef DFL_H +#define DFL_H + +#ifndef SU3_H +#include "su3.h" +#endif + +typedef struct +{ + int nb,nbb; + int nbbe[8],nbbo[8]; + int obbe[8],obbo[8]; + int (*inn)[8]; + int *idx,*ipp,*map; +} dfl_grid_t; + +/* DFL_GEOMETRY_C */ +extern dfl_grid_t dfl_geometry(void); + +/* DFL_MODES_C */ +extern void dfl_modes(int *status); +extern void dfl_update(int nsm,int *status); +extern void dfl_modes2(int *status); +extern void dfl_update2(int nsm,int *status); + +/* DFL_SAP_GCR_C */ +extern double dfl_sap_gcr(int nkv,int nmx,double res,double mu, + spinor_dble *eta,spinor_dble *psi,int *status); +extern double dfl_sap_gcr2(int nkv,int nmx,double res,double mu, + spinor_dble *eta,spinor_dble *psi,int *status); + +/* DFL_SUBSPACE_C */ +extern void dfl_sd2vd(spinor_dble *sd,complex_dble *vd); +extern void dfl_vd2sd(complex_dble *vd,spinor_dble *sd); +extern void dfl_sub_vd2sd(complex_dble *vd,spinor_dble *sd); +extern void dfl_s2v(spinor *s,complex *v); +extern void dfl_v2s(complex *v,spinor *s); +extern void dfl_sub_v2s(complex *v,spinor *s); +extern void dfl_subspace(spinor **mds); + +/* LTL_GCR */ +extern double ltl_gcr(int nkv,int nmx,double res,double mu, + complex_dble *eta,complex_dble *psi,int *status); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/dirac.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/dirac.h new file mode 100644 index 0000000000000000000000000000000000000000..4681830214158c8e2a076667f28316330bed9a38 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/dirac.h @@ -0,0 +1,55 @@ + +/******************************************************************************* +* +* File dirac.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef DIRAC_H +#define DIRAC_H + +#ifndef SU3_H +#include "su3.h" +#endif + +#ifndef BLOCK_H +#include "block.h" +#endif + +/* DW_BND_C */ +extern void Dw_bnd(blk_grid_t grid,int n,int k,int l); + +/* DW_C */ +extern void Dw(float mu,spinor *s,spinor *r); +extern void Dwee(float mu,spinor *s,spinor *r); +extern void Dwoo(float mu,spinor *s,spinor *r); +extern void Dweo(spinor *s,spinor *r); +extern void Dwoe(spinor *s,spinor *r); +extern void Dwhat(float mu,spinor *s,spinor *r); +extern void Dw_blk(blk_grid_t grid,int n,float mu,int k,int l); +extern void Dwee_blk(blk_grid_t grid,int n,float mu,int k,int l); +extern void Dwoo_blk(blk_grid_t grid,int n,float mu,int k,int l); +extern void Dwoe_blk(blk_grid_t grid,int n,int k,int l); +extern void Dweo_blk(blk_grid_t grid,int n,int k,int l); +extern void Dwhat_blk(blk_grid_t grid,int n,float mu,int k,int l); + +/* DW_DBLE_C */ +extern void Dw_dble(double mu,spinor_dble *s,spinor_dble *r); +extern void Dwee_dble(double mu,spinor_dble *s,spinor_dble *r); +extern void Dwoo_dble(double mu,spinor_dble *s,spinor_dble *r); +extern void Dweo_dble(spinor_dble *s,spinor_dble *r); +extern void Dwoe_dble(spinor_dble *s,spinor_dble *r); +extern void Dwhat_dble(double mu,spinor_dble *s,spinor_dble *r); +extern void Dw_blk_dble(blk_grid_t grid,int n,double mu,int k,int l); +extern void Dwee_blk_dble(blk_grid_t grid,int n,double mu,int k,int l); +extern void Dwoo_blk_dble(blk_grid_t grid,int n,double mu,int k,int l); +extern void Dwoe_blk_dble(blk_grid_t grid,int n,int k,int l); +extern void Dweo_blk_dble(blk_grid_t grid,int n,int k,int l); +extern void Dwhat_blk_dble(blk_grid_t grid,int n,double mu,int k,int l); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags.h new file mode 100644 index 0000000000000000000000000000000000000000..2545d8ecad07f12f83939151a91314e878866fef --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags.h @@ -0,0 +1,313 @@ + +/******************************************************************************* +* +* File flags.h +* +* Copyright (C) 2009-2014 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef FLAGS_H +#define FLAGS_H + +#ifndef BLOCK_H +#include "block.h" +#endif + +typedef enum +{ + UPDATED_U,UPDATED_UD,ASSIGNED_UD2U, + COPIED_BND_UD,SET_BSTAP,SHIFTED_UD,COMPUTED_FTS, + ERASED_SW,ERASED_SWD,COMPUTED_SWD,ASSIGNED_SWD2SW, + INVERTED_SW_E,INVERTED_SW_O, + INVERTED_SWD_E,INVERTED_SWD_O, + ASSIGNED_U2UBGR,ASSIGNED_UD2UBGR,ASSIGNED_UD2UDBGR, + ASSIGNED_SWD2SWBGR,ASSIGNED_SWD2SWDBGR, + ERASED_AW,ERASED_AWHAT,COMPUTED_AW,COMPUTED_AWHAT, + EVENTS +} event_t; + +typedef enum +{ + U_MATCH_UD,UDBUF_UP2DATE,BSTAP_UP2DATE, + FTS_UP2DATE,UBGR_MATCH_UD,UDBGR_MATCH_UD, + SW_UP2DATE,SW_E_INVERTED,SW_O_INVERTED, + SWD_UP2DATE,SWD_E_INVERTED,SWD_O_INVERTED, + AW_UP2DATE,AWHAT_UP2DATE, + QUERIES +} query_t; + +typedef enum +{ + ACG,ACF_TM1,ACF_TM1_EO,ACF_TM1_EO_SDET, + ACF_TM2,ACF_TM2_EO,ACF_RAT,ACF_RAT_SDET, + ACTIONS +} action_t; + +typedef enum +{ + LPFR,OMF2,OMF4, + INTEGRATORS +} integrator_t; + +typedef enum +{ + FRG,FRF_TM1,FRF_TM1_EO,FRF_TM1_EO_SDET, + FRF_TM2,FRF_TM2_EO,FRF_RAT,FRF_RAT_SDET, + FORCES +} force_t; + +typedef enum +{ + RWTM1,RWTM1_EO,RWTM2,RWTM2_EO,RWRAT, + RWFACTS +} rwfact_t; + +typedef enum +{ + CGNE,MSCG,SAP_GCR,DFL_SAP_GCR, + SOLVERS +} solver_t; + +typedef struct +{ + action_t action; + int ipf,im0; + int irat[3],imu[4]; + int isp[4]; +} action_parms_t; + +typedef struct +{ + int type; + double cG[2],cF[2]; + double phi[2][3]; +} bc_parms_t; + +typedef struct +{ + int bs[4]; + int Ns; +} dfl_parms_t; + +typedef struct +{ + int nkv,nmx; + double res; +} dfl_pro_parms_t; + +typedef struct +{ + int ninv,nmr,ncy; + double kappa,m0,mu; +} dfl_gen_parms_t; + +typedef struct +{ + int nsm; + double dtau; +} dfl_upd_parms_t; + +typedef struct +{ + force_t force; + int ipf,im0; + int irat[3],imu[4]; + int isp[4]; + int ncr[4],icr[4]; +} force_parms_t; + +typedef struct +{ + int npf,nlv; + int nact,nmu; + int *iact; + double tau,*mu; +} hmc_parms_t; + +typedef struct +{ + int nk; + double beta,c0,c1; + double *kappa,*m0; + double csw; +} lat_parms_t; + +typedef struct +{ + integrator_t integrator; + double lambda; + int nstep,nfr; + int *ifr; +} mdint_parms_t; + +typedef struct +{ + int degree; + double range[2]; +} rat_parms_t; + +typedef struct +{ + rwfact_t rwfact; + int im0,nsrc; + int irp,nfct; + double *mu; + int *np,*isp; +} rw_parms_t; + +typedef struct +{ + double m0,csw,cF[2]; +} sw_parms_t; + +typedef struct +{ + int bs[4]; + int isolv; + int nmr,ncy; +} sap_parms_t; + +typedef struct +{ + solver_t solver; + int nmx,nkv; + int isolv,nmr,ncy; + double res; +} solver_parms_t; + +typedef struct +{ + int eoflg; +} tm_parms_t; + +typedef struct +{ + int n; + double eps; +} wflow_parms_t; + +/* FLAGS_C */ +extern void set_flags(event_t event); +extern void set_grid_flags(blk_grid_t grid,event_t event); +extern int query_flags(query_t query); +extern int query_grid_flags(blk_grid_t grid,query_t query); +extern void print_flags(void); +extern void print_grid_flags(blk_grid_t grid); + +/* ACTION_PARMS_C */ +extern action_parms_t set_action_parms(int iact,action_t action,int ipf, + int im0,int *irat,int *imu,int *isp); +extern action_parms_t action_parms(int iact); +extern void read_action_parms(int iact); +extern void print_action_parms(void); +extern void write_action_parms(FILE *fdat); +extern void check_action_parms(FILE *fdat); + +/* DFL_PARMS_C */ +extern dfl_parms_t set_dfl_parms(int *bs,int Ns); +extern dfl_parms_t dfl_parms(void); +extern dfl_pro_parms_t set_dfl_pro_parms(int nkv,int nmx,double res); +extern dfl_pro_parms_t dfl_pro_parms(void); +extern dfl_gen_parms_t set_dfl_gen_parms(double kappa,double mu, + int ninv,int nmr,int ncy); +extern dfl_gen_parms_t dfl_gen_parms(void); +extern dfl_upd_parms_t set_dfl_upd_parms(double dtau,int nsm); +extern dfl_upd_parms_t dfl_upd_parms(void); +extern void print_dfl_parms(int ipr); +extern void write_dfl_parms(FILE *fdat); +extern void check_dfl_parms(FILE *fdat); + +/* FORCE_PARMS_C */ +extern force_parms_t set_force_parms(int ifr,force_t force,int ipf,int im0, + int *irat,int *imu,int *isp,int *ncr); +extern force_parms_t force_parms(int ifr); +extern void read_force_parms(int ifr); +extern void read_force_parms2(int ifr); +extern void print_force_parms(void); +extern void print_force_parms2(void); +extern void write_force_parms(FILE *fdat); +extern void check_force_parms(FILE *fdat); + +/* HMC_PARMS_C */ +extern hmc_parms_t set_hmc_parms(int nact,int *iact,int npf, + int nmu,double *mu,int nlv,double tau); +extern hmc_parms_t hmc_parms(void); +extern void print_hmc_parms(void); +extern void write_hmc_parms(FILE *fdat); +extern void check_hmc_parms(FILE *fdat); + +/* LAT_PARMS_C */ +extern lat_parms_t set_lat_parms(double beta,double c0, + int nk,double *kappa,double csw); +extern lat_parms_t lat_parms(void); +extern void print_lat_parms(void); +extern void write_lat_parms(FILE *fdat); +extern void check_lat_parms(FILE *fdat); +extern bc_parms_t set_bc_parms(int type, + double cG,double cG_prime, + double cF,double cF_prime, + double *phi,double *phi_prime); +extern bc_parms_t bc_parms(void); +extern void print_bc_parms(void); +extern void write_bc_parms(FILE *fdat); +extern void check_bc_parms(FILE *fdat); +extern double sea_quark_mass(int im0); +extern int bc_type(void); +extern sw_parms_t set_sw_parms(double m0); +extern sw_parms_t sw_parms(void); +extern tm_parms_t set_tm_parms(int eoflg); +extern tm_parms_t tm_parms(void); + +/* MDINT_PARMS_C */ +extern mdint_parms_t set_mdint_parms(int ilv, + integrator_t integrator,double lambda, + int nstep,int nfr,int *ifr); +extern mdint_parms_t mdint_parms(int ilv); +extern void read_mdint_parms(int ilv); +extern void print_mdint_parms(void); +extern void write_mdint_parms(FILE *fdat); +extern void check_mdint_parms(FILE *fdat); + +/* RAT_PARMS_C */ +extern rat_parms_t set_rat_parms(int irp,int degree,double *range); +extern rat_parms_t rat_parms(int irp); +extern void read_rat_parms(int irp); +extern void print_rat_parms(void); +extern void write_rat_parms(FILE *fdat); +extern void check_rat_parms(FILE *fdat); + +/* RW_PARMS_C */ +extern rw_parms_t set_rw_parms(int irw,rwfact_t rwfact,int im0,int nsrc, + int irp,int nfct,double *mu,int *np,int *isp); +extern rw_parms_t rw_parms(int irw); +extern void read_rw_parms(int irw); +extern void print_rw_parms(void); +extern void write_rw_parms(FILE *fdat); +extern void check_rw_parms(FILE *fdat); + +/* SAP_PARMS_C */ +extern sap_parms_t set_sap_parms(int *bs,int isolv,int nmr,int ncy); +extern sap_parms_t sap_parms(void); +extern void print_sap_parms(int ipr); +extern void write_sap_parms(FILE *fdat); +extern void check_sap_parms(FILE *fdat); + +/* SOLVER_PARMS_C */ +extern solver_parms_t set_solver_parms(int isp,solver_t solver, + int nkv,int isolv,int nmr,int ncy, + int nmx,double res); +extern solver_parms_t solver_parms(int isp); +extern void read_solver_parms(int isp); +extern void print_solver_parms(int *isap,int *idfl); +extern void write_solver_parms(FILE *fdat); +extern void check_solver_parms(FILE *fdat); + +/* WFLOW_PARMS_C */ +extern wflow_parms_t set_wflow_parms(int n,double eps); +extern wflow_parms_t wflow_parms(void); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/events.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/events.h new file mode 100644 index 0000000000000000000000000000000000000000..4067350c27bd1547895ad19d34a257069d8bc1fb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/events.h @@ -0,0 +1,137 @@ + +/******************************************************************************* +* +* File flags/events.h +* +* Copyright (C) 2009, 2010, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Full-lattice events +* +*******************************************************************************/ + +#define EVENTS_H + +#if (defined FLAGS_C) + +static void (*event_fcts[(int)(EVENTS)+1])(void)={NULL}; + + +static void LatUpdatedU(void) +{ + lat.u=next_tag(); +} + +static void LatUpdatedUd(void) +{ + lat.ud=next_tag(); +} + +static void LatAssignedUd2u(void) +{ + lat.u=lat.ud; +} + +static void LatCopiedBndUd(void) +{ + lat.udbuf=lat.ud; +} + +static void LatSetBstap(void) +{ + lat.bstap=lat.ud; +} + +static void LatShiftedUd(void) +{ + lat.ud=next_tag(); + lat.udbuf=0; +} + +static void LatComputedFts(void) +{ + lat.fts=lat.ud; +} + +static void LatErasedSw(void) +{ + lat.sw[0]=0; + lat.sw[1]=0; + lat.sw[2]=0; +} + +static void LatErasedSwd(void) +{ + lat.swd[0]=0; + lat.swd[1]=0; + lat.swd[2]=0; +} + +static void LatComputedSwd(void) +{ + lat.swd[0]=lat.ud; + lat.swd[1]=0; + lat.swd[2]=0; +} + +static void LatAssignedSwd2sw(void) +{ + lat.sw[0]=lat.swd[0]; + lat.sw[1]=lat.swd[1]; + lat.sw[2]=lat.swd[2]; +} + +static void LatInvertedSwdE(void) +{ + lat.swd[1]^=0x1; +} + +static void LatInvertedSwdO(void) +{ + lat.swd[2]^=0x1; +} + +static void LatErasedAw(void) +{ + lat.aw=0; +} + +static void LatErasedAwhat(void) +{ + lat.awh=0; +} + +static void LatComputedAw(void) +{ + lat.aw=lat.ud; +} + +static void LatComputedAwhat(void) +{ + lat.awh=lat.ud; +} + +static void set_events(void) +{ + event_fcts[(int)(UPDATED_U)]=LatUpdatedU; + event_fcts[(int)(UPDATED_UD)]=LatUpdatedUd; + event_fcts[(int)(ASSIGNED_UD2U)]=LatAssignedUd2u; + event_fcts[(int)(COPIED_BND_UD)]=LatCopiedBndUd; + event_fcts[(int)(SET_BSTAP)]=LatSetBstap; + event_fcts[(int)(SHIFTED_UD)]=LatShiftedUd; + event_fcts[(int)(COMPUTED_FTS)]=LatComputedFts; + event_fcts[(int)(ERASED_SW)]=LatErasedSw; + event_fcts[(int)(ERASED_SWD)]=LatErasedSwd; + event_fcts[(int)(COMPUTED_SWD)]=LatComputedSwd; + event_fcts[(int)(ASSIGNED_SWD2SW)]=LatAssignedSwd2sw; + event_fcts[(int)(INVERTED_SWD_E)]=LatInvertedSwdE; + event_fcts[(int)(INVERTED_SWD_O)]=LatInvertedSwdO; + event_fcts[(int)(ERASED_AW)]=LatErasedAw; + event_fcts[(int)(ERASED_AWHAT)]=LatErasedAwhat; + event_fcts[(int)(COMPUTED_AW)]=LatComputedAw; + event_fcts[(int)(COMPUTED_AWHAT)]=LatComputedAwhat; +} + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/grid_events.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/grid_events.h new file mode 100644 index 0000000000000000000000000000000000000000..974c617f078193cd63429ee2e4cb1a0e5f1e1595 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/grid_events.h @@ -0,0 +1,151 @@ + +/******************************************************************************* +* +* File grid_events.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Block grid events +* +*******************************************************************************/ + +#define GRID_EVENTS_H + +#if (defined FLAGS_C) + +static void (*grid_event_fcts[(int)(EVENTS)+1])(void)={NULL}; + +static void GridAssignedU2ubgr(void) +{ + if ((*gf).shf&0x1) + error_root(1,1,"GridAssignedU2ubgr [grid_events.h]", + "Event involving shared fields"); + else + (*gf).u=lat.u; +} + +static void GridAssignedUd2ubgr(void) +{ + if ((*gf).shf&0x1) + error_root(1,1,"GridAssignedUd2ubgr [grid_events.h]", + "Event involving shared fields"); + else + (*gf).u=lat.ud; +} + +static void GridAssignedUd2udbgr(void) +{ + if ((*gf).shf&0x2) + error_root(1,1,"GridAssignedUd2udbgr [grid_events.h]", + "Event involving shared fields"); + else + (*gf).ud=lat.ud; +} + +static void GridAssignedSwd2swbgr(void) +{ + if ((*gf).shf&0x1) + error_root(1,1,"GridAssignedSwd2swbgr [grid_events.h]", + "Event involving shared fields"); + else + { + (*gf).sw[0]=lat.swd[0]; + (*gf).sw[1]=lat.swd[1]; + (*gf).sw[2]=lat.swd[2]; + } +} + +static void GridAssignedSwd2swdbgr(void) +{ + if ((*gf).shf&0x2) + error_root(1,1,"GridAssignedSwd2swdbgr [grid_events.h]", + "Event involving shared fields"); + else + { + (*gf).swd[0]=lat.swd[0]; + (*gf).swd[1]=lat.swd[1]; + (*gf).swd[2]=lat.swd[2]; + } +} + +static void GridInvertedSwdE(void) +{ + if ((*gf).shf&0x2) + error_root(1,1,"GridInvertedSwdE [grid_events.h]", + "Event involving shared fields"); + else + (*gf).swd[1]^=0x1; +} + +static void GridInvertedSwdO(void) +{ + if ((*gf).shf&0x2) + error_root(1,1,"GridInvertedSwdO [grid_events.h]", + "Event involving shared fields"); + else + (*gf).swd[2]^=0x1; +} + +static void GridInvertedSwE(void) +{ + if ((*gf).shf&0x1) + error_root(1,1,"GridInvertedSwE [grid_events.h]", + "Event involving shared fields"); + else + (*gf).sw[1]^=0x1; +} + +static void GridInvertedSwO(void) +{ + if ((*gf).shf&0x1) + error_root(1,1,"GridInvertedSwO [grid_events.h]", + "Event involving shared fields"); + else + (*gf).sw[2]^=0x1; +} + +static void GridErasedSw(void) +{ + if ((*gf).shf&0x1) + error_root(1,1,"GridErasedSw [grid_events.h]", + "Event involving shared fields"); + else + { + (*gf).sw[0]=0; + (*gf).sw[1]=0; + (*gf).sw[2]=0; + } +} + +static void GridErasedSwd(void) +{ + if ((*gf).shf&0x2) + error_root(1,1,"GridErasedSwd [grid_events.h]", + "Event involving shared fields"); + else + { + (*gf).swd[0]=0; + (*gf).swd[1]=0; + (*gf).swd[2]=0; + } +} + +static void set_grid_events(void) +{ + grid_event_fcts[(int)(ASSIGNED_U2UBGR)]=GridAssignedU2ubgr; + grid_event_fcts[(int)(ASSIGNED_UD2UBGR)]=GridAssignedUd2ubgr; + grid_event_fcts[(int)(ASSIGNED_UD2UDBGR)]=GridAssignedUd2udbgr; + grid_event_fcts[(int)(ASSIGNED_SWD2SWBGR)]=GridAssignedSwd2swbgr; + grid_event_fcts[(int)(ASSIGNED_SWD2SWDBGR)]=GridAssignedSwd2swdbgr; + grid_event_fcts[(int)(INVERTED_SWD_E)]=GridInvertedSwdE; + grid_event_fcts[(int)(INVERTED_SWD_O)]=GridInvertedSwdO; + grid_event_fcts[(int)(INVERTED_SW_E)]=GridInvertedSwE; + grid_event_fcts[(int)(INVERTED_SW_O)]=GridInvertedSwO; + grid_event_fcts[(int)(ERASED_SW)]=GridErasedSw; + grid_event_fcts[(int)(ERASED_SWD)]=GridErasedSwd; +} + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/grid_queries.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/grid_queries.h new file mode 100644 index 0000000000000000000000000000000000000000..a4daafc9be7bc974e80419368fb618df6fb02f68 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/grid_queries.h @@ -0,0 +1,129 @@ + +/******************************************************************************* +* +* File grid_queries.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Block grid queries +* +*******************************************************************************/ + +#define GRID_QUERIES_H + +#if (defined FLAGS_C) + +static int (*grid_query_fcts[(int)(QUERIES)+1])(void)={NULL}; + +static int GridQueryUbgrMatchUd(void) +{ + if ((*gf).shf&0x1) + { + error_loc(1,1,"GridQueryUbgrMatchUd [grid_queries.h]", + "Query involving shared fields"); + return -1; + } + else + return ((*gf).u==lat.ud); +} + +static int GridQueryUdbgrMatchUd(void) +{ + if ((*gf).shf&0x2) + { + error_loc(1,1,"GridQueryUdbgrMatchUd [grid_queries.h]", + "Query involving shared fields"); + return -1; + } + else + return ((*gf).ud==lat.ud); +} + +static int GridQuerySwUp2date(void) +{ + if ((*gf).shf&0x1) + { + error_loc(1,1,"GridQuerySwUp2date [grid_queries.h]", + "Query involving shared fields"); + return -1; + } + else + return ((*gf).sw[0]==(*gf).u); +} + +static int GridQuerySwEInverted(void) +{ + if ((*gf).shf&0x1) + { + error_loc(1,1,"GridQuerySwEInverted [grid_queries.h]", + "Query involving shared fields"); + return -1; + } + else + return ((*gf).sw[1]==1); +} + +static int GridQuerySwOInverted(void) +{ + if ((*gf).shf&0x1) + { + error_loc(1,1,"GridQuerySwOInverted [grid_queries.h]", + "Query involving shared fields"); + return -1; + } + else + return ((*gf).sw[2]==1); +} + +static int GridQuerySwdUp2date(void) +{ + if ((*gf).shf&0x2) + { + error_loc(1,1,"GridQuerySwdUp2date [grid_queries.h]", + "Query involving shared fields"); + return -1; + } + else + return ((*gf).swd[0]==(*gf).ud); +} + +static int GridQuerySwdEInverted(void) +{ + if ((*gf).shf&0x2) + { + error_loc(1,1,"GridQuerySwdEInverted [grid_queries.h]", + "Query involving shared fields"); + return -1; + } + else + return ((*gf).swd[1]==1); +} + +static int GridQuerySwdOInverted(void) +{ + if ((*gf).shf&0x2) + { + error_loc(1,1,"GridQuerySwdOInverted [grid_queries.h]", + "Query involving shared fields"); + return -1; + } + else + return ((*gf).swd[2]==1); +} + +static void set_grid_queries(void) +{ + grid_query_fcts[(int)(UBGR_MATCH_UD)]=GridQueryUbgrMatchUd; + grid_query_fcts[(int)(UDBGR_MATCH_UD)]=GridQueryUdbgrMatchUd; + grid_query_fcts[(int)(SW_UP2DATE)]=GridQuerySwUp2date; + grid_query_fcts[(int)(SW_E_INVERTED)]=GridQuerySwEInverted; + grid_query_fcts[(int)(SW_O_INVERTED)]=GridQuerySwOInverted; + grid_query_fcts[(int)(SWD_UP2DATE)]=GridQuerySwdUp2date; + grid_query_fcts[(int)(SWD_E_INVERTED)]=GridQuerySwdEInverted; + grid_query_fcts[(int)(SWD_O_INVERTED)]=GridQuerySwdOInverted; +} + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/queries.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/queries.h new file mode 100644 index 0000000000000000000000000000000000000000..7525afd86bbe5551fcfb7a9d5c60b351e6a8a996 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/queries.h @@ -0,0 +1,97 @@ + +/******************************************************************************* +* +* File flags/queries.h +* +* Copyright (C) 2009, 2010, 2011, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Query descriptions +* +*******************************************************************************/ + +#define QUERIES_H + +#if (defined FLAGS_C) + +static int (*query_fcts[(int)(QUERIES)+1])(void)={NULL}; + +static int QueryUMatchUd(void) +{ + return (lat.u==lat.ud); +} + +static int QueryUdbufUp2date(void) +{ + return ((lat.ud>0)&&(lat.udbuf==lat.ud)); +} + +static int QueryBstapUp2date(void) +{ + return ((lat.ud>0)&&(lat.bstap==lat.ud)); +} + +static int QueryFtsUp2date(void) +{ + return ((lat.ud>0)&&(lat.fts==lat.ud)); +} + +static int QuerySwUp2date(void) +{ + return ((lat.u>0)&&(lat.sw[0]==lat.u)); +} + +static int QuerySwEInverted(void) +{ + return (lat.sw[1]==1); +} + +static int QuerySwOInverted(void) +{ + return (lat.sw[2]==1); +} + +static int QuerySwdUp2date(void) +{ + return ((lat.ud>0)&&(lat.swd[0]==lat.ud)); +} + +static int QuerySwdEInverted(void) +{ + return (lat.swd[1]==1); +} + +static int QuerySwdOInverted(void) +{ + return (lat.swd[2]==1); +} + +static int QueryAwUp2date(void) +{ + return ((lat.ud>0)&&(lat.aw==lat.ud)); +} + +static int QueryAwhatUp2date(void) +{ + return ((lat.ud>0)&&(lat.awh==lat.ud)); +} + +static void set_queries(void) +{ + query_fcts[(int)(U_MATCH_UD)]=QueryUMatchUd; + query_fcts[(int)(UDBUF_UP2DATE)]=QueryUdbufUp2date; + query_fcts[(int)(BSTAP_UP2DATE)]=QueryBstapUp2date; + query_fcts[(int)(FTS_UP2DATE)]=QueryFtsUp2date; + query_fcts[(int)(SW_UP2DATE)]=QuerySwUp2date; + query_fcts[(int)(SW_E_INVERTED)]=QuerySwEInverted; + query_fcts[(int)(SW_O_INVERTED)]=QuerySwOInverted; + query_fcts[(int)(SWD_UP2DATE)]=QuerySwdUp2date; + query_fcts[(int)(SWD_E_INVERTED)]=QuerySwdEInverted; + query_fcts[(int)(SWD_O_INVERTED)]=QuerySwdOInverted; + query_fcts[(int)(AW_UP2DATE)]=QueryAwUp2date; + query_fcts[(int)(AWHAT_UP2DATE)]=QueryAwhatUp2date; +} + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/forces.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/forces.h new file mode 100644 index 0000000000000000000000000000000000000000..90a8b50c7ab98bc91ff345badb17174f9ce671c6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/forces.h @@ -0,0 +1,90 @@ + +/******************************************************************************* +* +* File forces.h +* +* Copyright (C) 2011, 2012 Martin Luescher, Stefan Schaefer +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef FORCES_H +#define FORCES_H + +#ifndef SU3_H +#include "su3.h" +#endif + +#ifndef UTILS_H +#include "utils.h" +#endif + +/* FORCE0_C */ +extern void plaq_frc(void); +extern void force0(double c); +extern double action0(int icom); + +/* FORCE1_C */ +extern double setpf1(double mu,int ipf,int icom); +extern void force1(double mu,int ipf,int isp,int icr,double c,int *status); +extern double action1(double mu,int ipf,int isp,int icom,int *status); + +/* FORCE2_C */ +extern double setpf2(double mu0,double mu1,int ipf,int isp, + int icom,int *status); +extern void force2(double mu0,double mu1,int ipf,int isp,int icr, + double c,int *status); +extern double action2(double mu0,double mu1,int ipf,int isp, + int icom,int *status); + +/* FORCE3_C */ +extern double setpf3(int *irat,int ipf,int isw,int isp,int icom,int *status); +extern void force3(int *irat,int ipf,int isw,int isp,double c,int *status); +extern double action3(int *irat,int ipf,int isw,int isp,int icom,int *status); + +/* FORCE4_C */ +extern double setpf4(double mu,int ipf,int isw,int icom); +extern void force4(double mu,int ipf,int isw,int isp,int icr,double c, + int *status); +extern double action4(double mu,int ipf,int isw,int isp,int icom,int *status); + +/* FORCE5_C */ +extern double setpf5(double mu0,double mu1,int ipf,int isp,int icom, + int *status); +extern void force5(double mu0,double mu1,int ipf,int isp,int icr, + double c,int *status); +extern double action5(double mu0,double mu1,int ipf,int isp,int icom, + int *status); + +/* FRCFCTS_C */ +extern void det2xt(pauli_dble *m,u3_alg_dble *X); +extern void prod2xt(spinor_dble *r,spinor_dble *s,u3_alg_dble *X); +extern void (*prod2xv[])(spinor_dble *rx,spinor_dble *ry, + spinor_dble *sx,spinor_dble *sy,su3_dble *u); + +/* GENFRC_C */ +extern void sw_frc(double c); +extern void hop_frc(double c); + +/* TMCG_C */ +extern double tmcg(int nmx,double res,double mu, + spinor_dble *eta,spinor_dble *psi,int *status); +extern double tmcgeo(int nmx,double res,double mu, + spinor_dble *eta,spinor_dble *psi,int *status); + +/* TMCGM_C */ +extern void tmcgm(int nmx,double *res,int nmu,double *mu, + spinor_dble *eta,spinor_dble **psi,int *status); + +/* XTENSOR_C */ +extern u3_alg_dble **xtensor(void); +extern void set_xt2zero(void); +extern int add_det2xt(double c,ptset_t set); +extern void add_prod2xt(double c,spinor_dble *r,spinor_dble *s); +extern su3_dble *xvector(void); +extern void set_xv2zero(void); +extern void add_prod2xv(double c,spinor_dble *r,spinor_dble *s); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/global.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/global.h new file mode 100644 index 0000000000000000000000000000000000000000..d5b73d14d7bf6e828fcb6ec12587c62817e975d2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/global.h @@ -0,0 +1,81 @@ + +/******************************************************************************* +* +* File global.h +* +* Copyright (C) 2009, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Global parameters and arrays +* +*******************************************************************************/ + +#ifndef GLOBAL_H +#define GLOBAL_H + +#include "../../global.defs" + +#define NPROC0_BLK 1 +#define NPROC1_BLK 1 +#define NPROC2_BLK 1 +#define NPROC3_BLK 1 + +#define NAME_SIZE 128 + +/****************************** do not change *********************************/ + +#if ((NPROC0<1)||(NPROC1<1)||(NPROC2<1)||(NPROC3<1)|| \ + ((NPROC0>1)&&((NPROC0%2)!=0))||((NPROC1>1)&&((NPROC1%2)!=0))|| \ + ((NPROC2>1)&&((NPROC2%2)!=0))||((NPROC3>1)&&((NPROC3%2)!=0))) +#error : The number of processes in each direction must be 1 or a multiple of 2 +#endif + +#if ((L0<4)||(L1<4)||(L2<4)||(L3<4)|| \ + ((L0%2)!=0)||((L1%2)!=0)||((L2%2)!=0)||((L3%2)!=0)) +#error : The local lattice sizes must be even and not smaller than 4 +#endif + +#if ((NPROC0_BLK<1)||(NBROC0_BLK>NPROC0)||((NPROC0%NPROC0_BLK)!=0)|| \ + (NPROC1_BLK<1)||(NBROC1_BLK>NPROC1)||((NPROC1%NPROC1_BLK)!=0)|| \ + (NPROC2_BLK<1)||(NBROC2_BLK>NPROC2)||((NPROC2%NPROC2_BLK)!=0)|| \ + (NPROC3_BLK<1)||(NBROC3_BLK>NPROC3)||((NPROC3%NPROC3_BLK)!=0)) +#error : Improper processor block sizes NPROC0_BLK,..,NPROC3_BLK +#endif + +#if (NAME_SIZE<128) +#error : NAME_SIZE must be greater or equal to 128 +#endif + +#define NPROC (NPROC0*NPROC1*NPROC2*NPROC3) +#define VOLUME (L0*L1*L2*L3) +#define FACE0 ((1-(NPROC0%2))*L1*L2*L3) +#define FACE1 ((1-(NPROC1%2))*L2*L3*L0) +#define FACE2 ((1-(NPROC2%2))*L3*L0*L1) +#define FACE3 ((1-(NPROC3%2))*L0*L1*L2) +#define BNDRY (2*(FACE0+FACE1+FACE2+FACE3)) +#define NSPIN (VOLUME+(BNDRY/2)) +#define ALIGN 6 + +#ifndef SU3_H +#include "su3.h" +#endif + +#if defined MAIN_PROGRAM + #define EXTERN +#else + #define EXTERN extern +#endif + +EXTERN int cpr[4]; +EXTERN int npr[8]; + +EXTERN int ipt[VOLUME]; +EXTERN int iup[VOLUME][4]; +EXTERN int idn[VOLUME][4]; +EXTERN int map[BNDRY+NPROC%2]; + +#undef EXTERN + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/lattice.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/lattice.h new file mode 100644 index 0000000000000000000000000000000000000000..0073ef4e9784ec5f976c198fa6bb16967a11be80 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/lattice.h @@ -0,0 +1,61 @@ + +/******************************************************************************* +* +* File lattice.h +* +* Copyright (C) 2011, 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef LATTICE_H +#define LATTICE_H + +#ifndef BLOCK_H +#include "block.h" +#endif + +typedef struct +{ + int nu0,nuk; + int *iu0,*iuk; +} uidx_t; + +typedef struct +{ + int nft[2]; + int *ift[2]; +} ftidx_t; + +/* BCNDS_C */ +extern int *bnd_lks(int *n); +extern int *bnd_pts(int *n); +extern void set_bc(void); +extern int check_bc(double tol); +extern int chs_ubnd(int ibc); +extern void bnd_s2zero(ptset_t set,spinor *s); +extern void bnd_sd2zero(ptset_t set,spinor_dble *sd); + +/* FTIDX_C */ +extern ftidx_t *ftidx(void); +extern void plaq_ftidx(int n,int ix,int *ip); + +/* GEOMETRY_C */ +extern int ipr_global(int *n); +extern void ipt_global(int *x,int *ip,int *ix); +extern int global_time(int ix); +extern void geometry(void); +#if ((defined GEOMETRY_C)||(defined BLOCK_C)) +extern void blk_geometry(block_t *b); +extern void blk_imbed(block_t *b); +extern void bnd_geometry(block_t *b); +extern void bnd_imbed(block_t*b); +#endif + +/* UIDX_C */ +extern uidx_t *uidx(void); +extern void plaq_uidx(int n,int ix,int *ip); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/linalg.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/linalg.h new file mode 100644 index 0000000000000000000000000000000000000000..b961c2d86a801c62d094a125a4e9cf918959d7a5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/linalg.h @@ -0,0 +1,105 @@ + +/******************************************************************************* +* +* File linalg.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef LINALG_H +#define LINALG_H + +#ifndef SU3_H +#include "su3.h" +#endif + +/* CMATRIX_C */ +extern void cmat_vec(int n,complex *a,complex *v,complex *w); +extern void cmat_vec_assign(int n,complex *a,complex *v,complex *w); +extern void cmat_add(int n,complex *a,complex *b,complex *c); +extern void cmat_sub(int n,complex *a,complex *b,complex *c); +extern void cmat_mul(int n,complex *a,complex *b,complex *c); +extern void cmat_dag(int n,complex *a,complex *b); + +/* CMATRIX_DBLE_C */ +extern void cmat_vec_dble(int n,complex_dble *a,complex_dble *v, + complex_dble *w); +extern void cmat_vec_assign_dble(int n,complex_dble *a,complex_dble *v, + complex_dble *w); +extern void cmat_add_dble(int n,complex_dble *a,complex_dble *b, + complex_dble *c); +extern void cmat_sub_dble(int n,complex_dble *a,complex_dble *b, + complex_dble *c); +extern void cmat_mul_dble(int n,complex_dble *a,complex_dble *b, + complex_dble *c); +extern void cmat_dag_dble(int n,complex_dble *a,complex_dble *b); +extern int cmat_inv_dble(int n,complex_dble *a,complex_dble *b,double *k); + +/* LIEALG_C */ +extern void random_alg(int vol,su3_alg_dble *X); +extern double norm_square_alg(int vol,int icom,su3_alg_dble *X); +extern double scalar_prod_alg(int vol,int icom,su3_alg_dble *X,su3_alg_dble *Y); +extern void set_alg2zero(int vol,su3_alg_dble *X); +extern void set_ualg2zero(int vol,u3_alg_dble *X); +extern void assign_alg2alg(int vol,su3_alg_dble *X,su3_alg_dble *Y); +extern void swap_alg(int vol,su3_alg_dble *X,su3_alg_dble *Y); +extern void muladd_assign_alg(int vol,double r,su3_alg_dble *X,su3_alg_dble *Y); + +/* SALG_C */ +extern complex spinor_prod(int vol,int icom,spinor *s,spinor *r); +extern float spinor_prod_re(int vol,int icom,spinor *s,spinor *r); +extern float norm_square(int vol,int icom,spinor *s); +extern void mulc_spinor_add(int vol,spinor *s,spinor *r,complex z); +extern void mulr_spinor_add(int vol,spinor *s,spinor *r,float c); +extern void project(int vol,int icom,spinor *s,spinor *r); +extern void scale(int vol,float c,spinor *s); +extern float normalize(int vol,int icom,spinor *s); +extern void rotate(int vol,int n,spinor **ppk,complex *v); +extern void mulg5(int vol,spinor *s); +extern void mulmg5(int vol,spinor *s); + +/* SALG_DBLE_C */ +extern complex_dble spinor_prod_dble(int vol,int icom,spinor_dble *s, + spinor_dble *r); +extern double spinor_prod_re_dble(int vol,int icom,spinor_dble *s, + spinor_dble *r); +extern complex_dble spinor_prod5_dble(int vol,int icom,spinor_dble *s, + spinor_dble *r); +extern double norm_square_dble(int vol,int icom,spinor_dble *s); +extern void mulc_spinor_add_dble(int vol,spinor_dble *s,spinor_dble *r, + complex_dble z); +extern void mulr_spinor_add_dble(int vol,spinor_dble *s,spinor_dble *r, + double c); +extern void combine_spinor_dble(int vol,spinor_dble *s,spinor_dble *r, + double cs,double cr); +extern void project_dble(int vol,int icom,spinor_dble *s,spinor_dble *r); +extern void scale_dble(int vol,double c,spinor_dble *s); +extern double normalize_dble(int vol,int icom,spinor_dble *s); +extern void rotate_dble(int vol,int n,spinor_dble **ppk,complex_dble *v); +extern void mulg5_dble(int vol,spinor_dble *s); +extern void mulmg5_dble(int vol,spinor_dble *s); + +/* VALG_C */ +extern complex vprod(int n,int icom,complex *v,complex *w); +extern float vnorm_square(int n,int icom,complex *v); +extern void mulc_vadd(int n,complex *v,complex *w,complex z); +extern void vproject(int n,int icom,complex *v,complex *w); +extern void vscale(int n,float r,complex *v); +extern float vnormalize(int n,int icom,complex *v); +extern void vrotate(int n,int nv,complex **pv,complex *a); + +/* VALG_DBLE_C */ +extern complex_dble vprod_dble(int n,int icom,complex_dble *v,complex_dble *w); +extern double vnorm_square_dble(int n,int icom,complex_dble *v); +extern void mulc_vadd_dble(int n,complex_dble *v,complex_dble *w, + complex_dble z); +extern void vproject_dble(int n,int icom,complex_dble *v,complex_dble *w); +extern void vscale_dble(int n,double r,complex_dble *v); +extern double vnormalize_dble(int n,int icom,complex_dble *v); +extern void vrotate_dble(int n,int nv,complex_dble **pv,complex_dble *a); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/linsolv.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/linsolv.h new file mode 100644 index 0000000000000000000000000000000000000000..cde535c958abf51c700577a6310283b36ad443b5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/linsolv.h @@ -0,0 +1,46 @@ + +/******************************************************************************* +* +* File linsolv.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef LINSOLV_H +#define LINSOLV_H + +#ifndef SU3_H +#include "su3.h" +#endif + +/* CGNE_C */ +extern double cgne(int vol,int icom,void (*Dop)(spinor *s,spinor *r), + void (*Dop_dble)(spinor_dble *s,spinor_dble *r), + spinor **ws,spinor_dble **wsd,int nmx,double res, + spinor_dble *eta,spinor_dble *psi,int *status); + +/* FGCR4VD_C */ +extern double fgcr4vd(int vol,int icom, + void (*Dop)(complex_dble *v,complex_dble *w), + void (*Mop)(int k,complex *eta,complex *psi,complex *chi), + complex **wv,complex_dble **wvd,int nkv,int nmx,double res, + complex_dble *eta,complex_dble *psi,int *status); + +/* FGCR_C */ +extern double fgcr(int vol,int icom, + void (*Dop)(spinor_dble *s,spinor_dble *r), + void (*Mop)(int k,spinor *rho,spinor *phi,spinor *chi), + spinor **ws,spinor_dble **wsd,int nkv,int nmx,double res, + spinor_dble *eta,spinor_dble *psi,int *status); + +/* MSCG_C */ +extern void mscg(int vol,int icom,int nmu,double *mu, + void (*Dop_dble)(double mu,spinor_dble *s,spinor_dble *r), + spinor_dble **wsd,int nmx,double *res, + spinor_dble *eta,spinor_dble **psi,int *status); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/little.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/little.h new file mode 100644 index 0000000000000000000000000000000000000000..16918d3b3b785ae426d94aca1691cd3576344c54 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/little.h @@ -0,0 +1,83 @@ + +/******************************************************************************* +* +* File little.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef LITTLE_H +#define LITTLE_H + +#ifndef SU3_H +#include "su3.h" +#endif + +typedef struct +{ + int Ns,nb; + complex **Aee,**Aoo,**Aoe,**Aeo; +} Aw_t; + +typedef struct +{ + int Ns,nb; + complex_dble **Aee,**Aoo,**Aoe,**Aeo; +} Aw_dble_t; + +typedef struct +{ + int n[2]; + int vol,ibn; + spinor_dble **sde[2]; + spinor_dble **sdo[2]; +} b2b_flds_t; + +/* AW_COM_C */ +extern b2b_flds_t *b2b_flds(int n,int mu); +extern void cpAoe_ext_bnd(void); +extern void cpAee_int_bnd(void); + +/* AW_C */ +extern void Aw(complex *v,complex *w); +extern void Aweeinv(complex *v,complex *w); +extern void Awooinv(complex *v,complex *w); +extern void Awoe(complex *v,complex *w); +extern void Aweo(complex *v,complex *w); +extern void Awhat(complex *v,complex *w); + +/* AW_DBLE_C */ +extern void Aw_dble(complex_dble *v,complex_dble *w); +extern void Aweeinv_dble(complex_dble *v,complex_dble *w); +extern void Awooinv_dble(complex_dble *v,complex_dble *w); +extern void Awoe_dble(complex_dble *v,complex_dble *w); +extern void Aweo_dble(complex_dble *v,complex_dble *w); +extern void Awhat_dble(complex_dble *v,complex_dble *w); + +/* AW_GEN_C */ +extern void gather_ud(int vol,int *imb,su3_dble *ud,su3_dble *vd); +extern void gather_sd(int vol,int *imb,spinor_dble *sd,spinor_dble *rd); +extern void apply_u2sd(int vol,int *imb,su3_dble *ud,spinor_dble *sd, + spinor_dble *rd); +extern void apply_udag2sd(int vol,int *imb,su3_dble *ud,spinor_dble *sd, + spinor_dble *rd); +extern void (*spinor_prod_gamma[])(int vol,spinor_dble *sd,spinor_dble *rd, + complex_dble *sp); + +/* AW_OPS_C */ +extern Aw_t Awop(void); +extern Aw_t Awophat(void); +extern Aw_dble_t Awop_dble(void); +extern Aw_dble_t Awophat_dble(void); +extern void set_Aw(double mu); +extern int set_Awhat(double mu); + +/* LTL_MODES_C */ +extern int set_ltl_modes(void); +extern complex_dble *ltl_matrix(void); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/mdflds.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/mdflds.h new file mode 100644 index 0000000000000000000000000000000000000000..7820439d885f30bd2f2cdcaf2e954f2494fd4c9f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/mdflds.h @@ -0,0 +1,38 @@ + +/******************************************************************************* +* +* File mdflds.h +* +* Copyright (C) 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef MDFLDS_H +#define MDFLDS_H + +#ifndef SU3_H +#include "su3.h" +#endif + +typedef struct +{ + int npf; + su3_alg_dble *mom,*frc; + spinor_dble **pf; +} mdflds_t; + +/* FCOM_C */ +extern void copy_bnd_frc(void); +extern void add_bnd_frc(void); + +/* MDFLDS_C */ +extern mdflds_t *mdflds(void); +extern void set_frc2zero(void); +extern void bnd_mom2zero(void); +extern void random_mom(void); +extern double momentum_action(int icom); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/nompi/extras.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/nompi/extras.h new file mode 100644 index 0000000000000000000000000000000000000000..ef6abb6e1d14587a797921900e5e1f23bad7fbf4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/nompi/extras.h @@ -0,0 +1,53 @@ + +/******************************************************************************* +* +* File nompi/extras.h +* +* Copyright (C) 2009, 2010, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef EXTRAS_H +#define EXTRAS_H + +/* CHEBYSHEV_C */ +extern int cheby_fit(double a,double b,double (*f)(double x), + int dmax,double eps,double c[]); +extern double cheby_int(double a,double b,double (*f)(double x), + int dmax,double eps); +extern double cheby_val(double a,double b,int n,double c[],double x); + +/* FSOLVE_C */ +extern double inverse_fct(double x1,double x2,double (*f)(double x),double y, + double omega1,double omega2); +extern double minimize_fct(double x0,double x1,double x2,double (*f)(double x), + double omega1,double omega2); +extern void powell(int n,double *x0,double *x1,double *x2, + double (*f)(int n,double *x),int imx,double omega1, + double omega2,double *xmin,int *status); + +/* I0M_C */ +extern double i0m(double x); + +/* KS_TEST_C */ +extern void ks_test(int n,double f[],double *pkp,double *pkm); +extern void ks_prob(int n,double kp,double km,double *pp,double *pm); + +/* PCHI_SQUARE_C */ +extern double pchi_square(double chi_square,int nu); + +/* STAT_C */ +extern double average(int n,double *a); +extern double sigma0(int n,double *a); +extern double auto_corr(int n,double *a,int tmax,double *g); +extern void sigma_auto_corr(int n,double *a,int tmax,int lambda,double *eg); +extern double tauint(int n,double *a,int tmax,int lambda,int *w,double *sigma); +extern double print_auto(int n,double *a); +extern double jack_err(int nx,int n,double **a,double (*f)(int nx,double *x), + int bmax,double *sig); +extern double print_jack(int nx,int n,double **a,double (*f)(int nx,double *x)); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/nompi/utils.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/nompi/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..3e37c10ca625226ef856a386d620bdb6fea4e851 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/nompi/utils.h @@ -0,0 +1,79 @@ + +/******************************************************************************* +* +* File nompi/utils.h +* +* Copyright (C) 2009, 2010, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef UTILS_H +#define UTILS_H + +#include +#include + +#define NAME_SIZE 128 + +#if ((DBL_MANT_DIG!=53)||(DBL_MIN_EXP!=-1021)||(DBL_MAX_EXP!=1024)) +#error : Machine is not compliant with the IEEE-754 standard +#endif + +#if (SHRT_MAX==0x7fffffff) +typedef short int stdint_t; +typedef unsigned short int stduint_t; +#elif (INT_MAX==0x7fffffff) +typedef int stdint_t; +typedef unsigned int stduint_t; +#elif (LONG_MAX==0x7fffffff) +typedef long int stdint_t; +typedef unsigned long int stduint_t; +#else +#error : There is no four-byte integer type on this machine +#endif + +#undef UNKNOWN_ENDIAN +#undef LITTLE_ENDIAN +#undef BIG_ENDIAN + +#define UNKNOWN_ENDIAN 0 +#define LITTLE_ENDIAN 1 +#define BIG_ENDIAN 2 + +#undef IMAX +#define IMAX(n,m) ((n)+((m)-(n))*((m)>(n))) + +typedef enum +{ + ALL_PTS,EVEN_PTS,ODD_PTS,NO_PTS,PT_SETS +} ptset_t; + +/* ENDIAN_C */ +extern int endianness(void); +extern void bswap_int(int n,void *a); +extern void bswap_double(int n,void *a); + +/* MUTILS_C */ +extern int find_opt(int argc,char *argv[],char *opt); +extern int digits(double x,double dx,char *fmt); +extern int fdigits(double x); +extern int name_size(char *format,...); +extern long find_section(FILE *stream,char *title); +extern long read_line(FILE *stream,char *tag,char *format,...); +extern int count_tokens(FILE *stream,char *tag); +extern void read_iprms(FILE *stream,char *tag,int n,int *iprms); +extern void read_dprms(FILE *stream,char *tag,int n,double *dprms); + +/* UTILS_C */ +extern int safe_mod(int x,int y); +extern void *amalloc(size_t size,int p); +extern void afree(void *addr); +extern void error(int test,int no,char *name,char *format,...); +extern void error_root(int test,int no,char *name,char *format,...); +extern int error_loc(int test,int no,char *name,char *format,...); +extern void message(char *format,...); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/qpx.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/qpx.h new file mode 100644 index 0000000000000000000000000000000000000000..8894e0eb29a4e49016401d0bc78c921687fea7a3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/qpx.h @@ -0,0 +1,291 @@ +#ifndef QPX_H +#define QPX_H +/******************************************************************************* +* +* File qpx.h +* +* Copyright (C) 2013 Dalibor Djukanovic +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Type definitions and macros for fast manipulation of +* SU(3) matrices, SU(3) vectors and Dirac spinors exploiting the Quad FPU +* unit of BlueGene/Q +* +*******************************************************************************/ + +#ifdef PDToolkit +/* Needed for parsing with TAU */ +typedef double vector4double[4]; +#endif + +static vector4double qpx_r1, qpx_r2, qpx_r3, qpx_r4, qpx_r5, qpx_r6, qpx_r7, qpx_r8, qpx_r9; +static vector4double qpx_r10, qpx_r11, qpx_r12, qpx_r13, qpx_r14, qpx_r15, qpx_r16, qpx_r17; +static vector4double vec_i=(vector4double){0,1,0,1}; +static vector4double vec_i_s=(vector4double){0,1,0,-1}; +static vector4double sign0=(vector4double){1.,1.,1.,-1.}; +static vector4double sign1=(vector4double){1.,1.,-1.,-1.}; +static vector4double sign2=(vector4double){-1.,-1.,1.,1.}; + +/* Operands for qvfperm QRT QRA QRB QRC + QRC[msw] double QRC[12:14] QRT + 0x4000 0000 = 2.0 000 = 0 QRA0 + 0x4002 0000 = 2.25 001 = 1 QRA1 + 0x4004 0000 = 2.50 010 = 2 QRA2 + 0x4006 0000 = 2.75 011 = 3 QRA3 + 0x4008 0000 = 3.00 100 = 4 QRB0 + 0x400a 0000 = 3.25 101 = 5 QRB1 + 0x400c 0000 = 3.50 110 = 6 QRB2 + 0x400e 0000 = 3.75 111 = 7 QRB3 +*/ +static vector4double perm0011={2.000000,2.000000,2.250000,2.250000}; /* A0 A0 A1 A1 */ +static vector4double perm2233={2.500000,2.500000,2.750000,2.750000}; /* A2 A2 A3 A3 */ +static vector4double perm1={2.000000,2.250000,3.000000,3.250000}; /* A0 A1 B0 B1 */ +static vector4double perm2={2.500000,2.750000,3.500000,3.750000}; /* A2 A3 B2 B3 */ +static vector4double perml1={2.000000,3.00000,2.250000,3.000000}; /* A0 B0 A1 B0 */ +static vector4double perml2={2.500000,3.00000,2.750000,3.000000}; /* A2 B0 A3 B0 */ +static vector4double perm12={2.000000,2.250000,3.500000,3.750000}; /* A0 A1 B2 B3 */ +static vector4double perm21={3.500000,3.750000,2.000000,2.250000}; /* B2 B3 A0 A1 */ + +/* Prefetch */ + +#define _qpx_prefetch_su3_dp(addr)\ + __dcbt(((char*)((unsigned long int)(addr)))); \ + __dcbt(((char*)((unsigned long int)(addr)))+128); + +#define _qpx_prefetch_spinor_dp(addr)\ + __dcbt(((char*)((unsigned long int)(addr)))); \ + __dcbt(((char*)((unsigned long int)(addr)))+128); + +#define _qpx_prefetch_su3_sp(addr)\ + __dcbt(((char*)((unsigned long int)(addr)))); + +#define _qpx_prefetch_spinor_sp(addr)\ + __dcbt(((char*)((unsigned long int)(addr)))); + +/* Load and Store + + Asssume 32 Byte alignment for double precision structures + (spinor_dble, weyl_dble, and su3_dble) and 16 Byte alignment + for single-precision structures (spinor, weyl, su3) + + Use vec_lda and vec_sta to raise exception (SIG 7) + in case of incorrect alignment (if environment variable + BG_MAXALIGNEXP is set to a small value, e.g. 1) +*/ + +/* Load first Weyl spinor = components (c1, c2) of Dirac spinor: + psi11 <- c1.c1.re c1.c1.im c1.c2.re c1.c2.im + psi11 <- c1.c3.re c1.c3.im c2.c1.re c2.c1.im + psi11 <- c2.c2.re c2.c2.im c2.c3.re c2.c3.im +*/ +#define _qpx_load_w1(r,ps)\ + r##1=vec_lda(0,&(((ps)->c1).c1.re)); \ + r##2=vec_lda(0,&(((ps)->c1).c3.re)); \ + r##3=vec_lda(0,&(((ps)->c2).c2.re)); + +/* Load second Weyl spinor = components (c3, c4) of Dirac spinor: + psi11 <- c3.c1.re c3.c1.im c3.c2.re c3.c2.im + psi11 <- c3.c3.re c3.c3.im c4.c1.re c4.c1.im + psi11 <- c4.c2.re c4.c2.im c4.c3.re c4.c3.im +*/ +#define _qpx_load_w2(r,ps)\ + r##1=vec_lda(0,&(((ps)->c3).c1.re)); \ + r##2=vec_lda(0,&(((ps)->c3).c3.re)); \ + r##3=vec_lda(0,&(((ps)->c4).c2.re)); + +#define _qpx_store_w1(r,ps)\ + vec_sta(r##1,0,&(((ps)->c1).c1.re));\ + vec_sta(r##2,0,&(((ps)->c1).c3.re));\ + vec_sta(r##3,0,&(((ps)->c2).c2.re)); + +#define _qpx_store_w2(r,ps)\ + vec_sta(r##1,0,&(((ps)->c3).c1.re));\ + vec_sta(r##2,0,&(((ps)->c3).c3.re));\ + vec_sta(r##3,0,&(((ps)->c4).c2.re)); + +/* Permutation for Dirac Operators + res1 = ( v2.2 v2.3 v3.0 v3.1 ) + res2 = ( v3.2 v2.3 v1.0 v1.1 ) + res3 = ( v1.2 v1.3 v2.0 v2.1 ) +*/ +#define _qpx_vec_x(res,v)\ + res##1=vec_sldw(v##2,v##3,2); \ + res##2=vec_sldw(v##3,v##1,2); \ + res##3=vec_sldw(v##1,v##2,2); + + +/********************** Math functions ********************/ + +/******************* res = va + vb ***********************/ +#define _qpx_vec_add(res, va,vb) \ + res##1=vec_add(va##1,vb##1); \ + res##2=vec_add(va##2,vb##2); \ + res##3=vec_add(va##3,vb##3); + +/********************************************************* + res1 = va1 + vb1 + res2 = va2 + ( +vb2.0 +vb2.1 -vb2.2 -vb2.3 ) + res3 = va3 - vb3 + + If the operands are + va1 = ( psi_1 psi_1 ) vb1 = ( psi_4 psi_4 ) + va2 = ( psi_1 psi_2 ) vb2 = ( psi_4 psi_3 ) + va3 = ( psi_2 psi_2 ) vb3 = ( psi_3 psi_3 ) + then + res1 = ( phi_1 phi_1 ) + res2 = ( phi_1 phi_2 ) + res3 = ( phi_2 phi_2 ) + where + phi_1 = psi_1 + psi_4 + phi_2 = psi_2 - psi_3 + is the spinor combination for mu=+2 of eq. (A.12) of doc/dirac.pdf +*/ +#define _qpx_vec_add_n(res, va,vb) \ + res##1=vec_add(va##1,vb##1); \ + res##2=vec_madd(vb##2,sign1,va##2); \ + res##3=vec_sub(va##3,vb##3); + + +/******************* res = va - vb ***********************/ +#define _qpx_vec_sub(res, va,vb) \ + res##1=vec_sub(va##1,vb##1); \ + res##2=vec_sub(va##2,vb##2); \ + res##3=vec_sub(va##3,vb##3); + + +/********************************************************* + res1 = va1 - vb1 + res2 = va2 + ( -vb2.0 -vb2.1 +vb2.2 +vb2.3 ) + res3 = va3 + vb3 + + If the operands are + va1 = ( psi_1 psi_1 ) vb1 = ( psi_4 psi_4 ) + va2 = ( psi_1 psi_2 ) vb2 = ( psi_4 psi_3 ) + va3 = ( psi_2 psi_2 ) vb3 = ( psi_3 psi_3 ) + then + res1 = ( phi_1 phi_1 ) + res2 = ( phi_1 phi_2 ) + res3 = ( phi_2 phi_2 ) + where + phi_1 = psi_1 - psi_4 + phi_2 = psi_2 + psi_3 + is the spinor combination for mu=-2 of eq. (A.13) of doc/dirac.pdf +*/ +#define _qpx_vec_sub_n(res, va,vb) \ + res##1=vec_sub(va##1,vb##1); \ + res##2=vec_madd(sign2,vb##2,va##2); \ + res##3=vec_add(va##3,vb##3); + + +/******************* res = va - i vb **********************/ +#define _qpx_vec_i_sub(res,va,vb) \ + res##1=vec_xxcpnmadd(vb##1,vec_i,va##1);\ + res##2=vec_xxcpnmadd(vb##2,vec_i,va##2);\ + res##3=vec_xxcpnmadd(vb##3,vec_i,va##3); + +/********************************************************* + res1 = va1 - i vb1 + res2 = va2 + ( -i vb2.0, -i vb2.1, +i vb2.2, +i vb2.3 ) + res3 = va3 + i vb3 +*/ +#define _qpx_vec_i_sub_n(res,va,vb) \ + res##1=vec_xxcpnmadd(vb##1,vec_i,va##1);\ + res##2=vec_xxcpnmadd(vb##2,vec_i_s,va##2);\ + res##3=vec_xxnpmadd(vb##3,vec_i,va##3); + +/******************* res = va + i vb **********************/ +#define _qpx_vec_i_add(res,va,vb)\ + res##1=vec_xxnpmadd(vb##1,vec_i,va##1); \ + res##2=vec_xxnpmadd(vb##2,vec_i,va##2); \ + res##3=vec_xxnpmadd(vb##3,vec_i,va##3); + +/********************************************************* + res1 = va1 + i vb1 + res2 = va2 + ( +i vb2.0, +i vb2.1, -i vb2.2, -i vb2.3 ) + res3 = va3 - i vb3 +*/ +#define _qpx_vec_i_add_n(res,va,vb)\ + res##1=vec_xxnpmadd(vb##1,vec_i,va##1); \ + res##2=vec_xxnpmadd(vb##2,vec_i_s,va##2); \ + res##3=vec_xxcpnmadd(vb##3,vec_i,va##3); + +#define _qpx_su3_mul(res,u,psi) \ + qpx_r1=vec_ld2(0,&((u).c11.re)); \ + qpx_r2=vec_ld2(0,&((u).c21.re)); \ + qpx_r3=vec_ld2(0,&((u).c31.re)); \ + qpx_r4=vec_ld2(0,&((u).c12.re)); \ + qpx_r5=vec_ld2(0,&((u).c22.re)); \ + qpx_r6=vec_ld2(0,&((u).c32.re)); \ + qpx_r7=vec_ld2(0,&((u).c13.re)); \ + qpx_r8=vec_ld2(0,&((u).c23.re)); \ + qpx_r9=vec_ld2(0,&((u).c33.re)); \ + qpx_r10=vec_perm(psi##1,psi##2,perm12);\ + qpx_r11=vec_sldw(psi##1,psi##3,2);\ + qpx_r12=vec_perm(psi##2,psi##3,perm12);\ + qpx_r13=vec_xxnpmadd(qpx_r10,qpx_r1,vec_xmul(qpx_r1,qpx_r10));\ + qpx_r14=vec_xxnpmadd(qpx_r11,qpx_r4,vec_xmadd(qpx_r4,qpx_r11,qpx_r13));\ + qpx_r15=vec_xxnpmadd(qpx_r12,qpx_r7,vec_xmadd(qpx_r7,qpx_r12,qpx_r14));\ + qpx_r13=vec_xxnpmadd(qpx_r10,qpx_r2,vec_xmul(qpx_r2,qpx_r10));\ + qpx_r14=vec_xxnpmadd(qpx_r11,qpx_r5,vec_xmadd(qpx_r5,qpx_r11,qpx_r13));\ + qpx_r16=vec_xxnpmadd(qpx_r12,qpx_r8,vec_xmadd(qpx_r8,qpx_r12,qpx_r14));\ + qpx_r13=vec_xxnpmadd(qpx_r10,qpx_r3,vec_xmul(qpx_r3,qpx_r10));\ + qpx_r14=vec_xxnpmadd(qpx_r11,qpx_r6,vec_xmadd(qpx_r6,qpx_r11,qpx_r13));\ + qpx_r17=vec_xxnpmadd(qpx_r12,qpx_r9,vec_xmadd(qpx_r9,qpx_r12,qpx_r14));\ + res##1=vec_perm(qpx_r15,qpx_r16,perm1);\ + res##2=vec_perm(qpx_r17,qpx_r15,perm12);\ + res##3=vec_perm(qpx_r16,qpx_r17,perm2); + +#define _qpx_su3_inv_mul(res,u,psi) \ + qpx_r1=vec_ld2(0,&((u).c11.re)); \ + qpx_r2=vec_ld2(0,&((u).c12.re)); \ + qpx_r3=vec_ld2(0,&((u).c13.re)); \ + qpx_r4=vec_ld2(0,&((u).c21.re)); \ + qpx_r5=vec_ld2(0,&((u).c22.re)); \ + qpx_r6=vec_ld2(0,&((u).c23.re)); \ + qpx_r7=vec_ld2(0,&((u).c31.re)); \ + qpx_r8=vec_ld2(0,&((u).c32.re)); \ + qpx_r9=vec_ld2(0,&((u).c33.re)); \ + qpx_r10=vec_perm(psi##1,psi##2,perm12);\ + qpx_r11=vec_sldw(psi##1,psi##3,2);\ + qpx_r12=vec_perm(psi##2,psi##3,perm12);\ + qpx_r13=vec_xxcpnmadd(qpx_r10,qpx_r1,vec_xmul(qpx_r1,qpx_r10));\ + qpx_r14=vec_xxcpnmadd(qpx_r11,qpx_r4,vec_xmadd(qpx_r4,qpx_r11,qpx_r13));\ + qpx_r15=vec_xxcpnmadd(qpx_r12,qpx_r7,vec_xmadd(qpx_r7,qpx_r12,qpx_r14));\ + qpx_r13=vec_xxcpnmadd(qpx_r10,qpx_r2,vec_xmul(qpx_r2,qpx_r10));\ + qpx_r14=vec_xxcpnmadd(qpx_r11,qpx_r5,vec_xmadd(qpx_r5,qpx_r11,qpx_r13));\ + qpx_r16=vec_xxcpnmadd(qpx_r12,qpx_r8,vec_xmadd(qpx_r8,qpx_r12,qpx_r14));\ + qpx_r13=vec_xxcpnmadd(qpx_r10,qpx_r3,vec_xmul(qpx_r3,qpx_r10));\ + qpx_r14=vec_xxcpnmadd(qpx_r11,qpx_r6,vec_xmadd(qpx_r6,qpx_r11,qpx_r13));\ + qpx_r17=vec_xxcpnmadd(qpx_r12,qpx_r9,vec_xmadd(qpx_r9,qpx_r12,qpx_r14));\ + res##1=vec_perm(qpx_r15,qpx_r16,perm1);\ + res##2=vec_perm(qpx_r17,qpx_r15,perm12);\ + res##3=vec_perm(qpx_r16,qpx_r17,perm2); + +#define _qpx_vec_i_add_assign(res,va) \ + res##1=vec_xxnpmadd(va##1,vec_i,res##1); \ + res##2=vec_xxnpmadd(va##2,vec_i,res##2); \ + res##3=vec_xxnpmadd(va##3,vec_i,res##3); + +#define _qpx_vec_add_assign(va,vb) \ + va##1=vec_add(va##1,vb##1); \ + va##2=vec_add(va##2,vb##2); \ + va##3=vec_add(va##3,vb##3); + +#define _qpx_vec_sub_assign(va,vb) \ + va##1=vec_sub(va##1,vb##1); \ + va##2=vec_sub(va##2,vb##2); \ + va##3=vec_sub(va##3,vb##3); + +#define _qpx_vec_i_sub_assign(res,va) \ + res##1=vec_xxcpnmadd(va##1,vec_i,res##1); \ + res##2=vec_xxcpnmadd(va##2,vec_i,res##2); \ + res##3=vec_xxcpnmadd(va##3,vec_i,res##3); + +#define _qpx_vec_prod(a,b,res)\ + res##1=vec_xxcpnmadd(b##1,a##1,vec_xmadd(a##1,b##1,res##1));\ + res##2=vec_xxcpnmadd(b##2,a##2,vec_xmadd(a##2,b##2,res##2));\ + res##3=vec_xxcpnmadd(b##3,a##3,vec_xmadd(a##3,b##3,res##3)); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/random.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/random.h new file mode 100644 index 0000000000000000000000000000000000000000..191ee57b93128208cccd8f6785c9301b32bfd730 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/random.h @@ -0,0 +1,39 @@ + +/******************************************************************************* +* +* File random.h +* +* Copyright (C) 2005, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef RANDOM_H +#define RANDOM_H + +/* GAUSS_C */ +extern void gauss(float r[],int n); +extern void gauss_dble(double r[],int n); + +/* RANLUX_C */ +extern void start_ranlux(int level,int seed); +extern void export_ranlux(int tag,char *out); +extern int import_ranlux(char *in); + +/* RANLXS_C */ +extern void ranlxs(float r[],int n); +extern void rlxs_init(int level,int seed); +extern int rlxs_size(void); +extern void rlxs_get(int state[]); +extern void rlxs_reset(int state[]); + +/* RANLXD_C */ +extern void ranlxd(double r[],int n); +extern void rlxd_init(int level,int seed); +extern int rlxd_size(void); +extern void rlxd_get(int state[]); +extern void rlxd_reset(int state[]); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/ratfcts.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/ratfcts.h new file mode 100644 index 0000000000000000000000000000000000000000..882551ab17ec33b97f263ea0533543d248ea08a6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/ratfcts.h @@ -0,0 +1,34 @@ + +/******************************************************************************* +* +* File ratfcts.h +* +* Copyright (C) 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef RATFCTS_H +#define RATFCTS_H + +typedef struct +{ + int np; + double A,delta; + double *mu,*rmu; + double *nu,*rnu; +} ratfct_t; + +/* ELLIPTIC_C */ +extern double ellipticK(double rk); +extern void sncndn(double u,double rk,double *sn,double *cn,double *dn); + +/* RATFCTS_C */ +extern ratfct_t ratfct(int *irat); + +/* ZOLOTAREV_C */ +extern void zolotarev(int n,double eps,double *A,double *ar,double *delta); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sap.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sap.h new file mode 100644 index 0000000000000000000000000000000000000000..6a6efbab6f7d592bab74c1ab921d433cf524c37d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sap.h @@ -0,0 +1,37 @@ + +/******************************************************************************* +* +* File sap.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef SAP_H +#define SAP_H + +#ifndef SU3_H +#include "su3.h" +#endif + +/* BLK_SOLV_C */ +extern void blk_mres(int n,float mu,int nmr); +extern void blk_eo_mres(int n,float mu,int nmr); + +/* SAP_COM_C */ +#if ((defined SAP_COM_C)||(defined BLK_GRID_C )) +extern void alloc_sap_bufs(void); +#endif +extern void sap_com(int ic,spinor *r); + +/* SAP */ +extern void sap(float mu,int isolv,int nmr,spinor *psi,spinor *eta); + +/* SAP_GCR */ +extern double sap_gcr(int nkv,int nmx,double res,double mu, + spinor_dble *eta,spinor_dble *psi,int *status); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sflds.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sflds.h new file mode 100644 index 0000000000000000000000000000000000000000..03905e271e40de745b5008504cb475122b53b846 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sflds.h @@ -0,0 +1,59 @@ + +/******************************************************************************* +* +* File sflds.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef SFLDS_H +#define SFLDS_H + +#ifndef SU3_H +#include "su3.h" +#endif + +/* PBND_C */ +extern void (*assign_s2w[8])(int *imb,int vol,spinor *s,weyl *r); +extern void (*add_assign_w2s[8])(int *imb,int vol,weyl *s,spinor *r); +extern void (*sub_assign_w2s[8])(int *imb,int vol,weyl *s,spinor *r); +extern void (*mulg5_sub_assign_w2s[8])(int *imb,int vol,weyl *s,spinor *r); + +/* PBND_DBLE_C */ +extern void (*assign_sd2wd[8])(int *imb,int vol,spinor_dble *sd, + weyl_dble *rd); +extern void (*add_assign_wd2sd[8])(int *imb,int vol,weyl_dble *sd, + spinor_dble *rd); +extern void (*sub_assign_wd2sd[8])(int *imb,int vol,weyl_dble *sd, + spinor_dble *rd); +extern void (*mulg5_sub_assign_wd2sd[8])(int *imb,int vol,weyl_dble *sd, + spinor_dble *rd); + +/* SFLDS_C */ +extern void set_s2zero(int vol,spinor *s); +extern void set_sd2zero(int vol,spinor_dble *sd); +extern void random_s(int vol,spinor *s,float sigma); +extern void random_sd(int vol,spinor_dble *sd,double sigma); +extern void assign_s2s(int vol,spinor *s,spinor *r); +extern void assign_s2sd(int vol,spinor *s,spinor_dble *rd); +extern void assign_sd2s(int vol,spinor_dble *sd,spinor *r); +extern void assign_sd2sd(int vol,spinor_dble *sd,spinor_dble *rd); +extern void diff_s2s(int vol,spinor *s,spinor *r); +extern void add_s2sd(int vol,spinor *s,spinor_dble *rd); +extern void diff_sd2s(int vol,spinor_dble *sd,spinor_dble *rd,spinor *r); + +/* SCOM_C */ +extern void cps_int_bnd(int is,spinor *s); +extern void cps_ext_bnd(int is,spinor *s); + +/* SDCOM_C */ +extern void cpsd_int_bnd(int is,spinor_dble *sd); +extern void cpsd_ext_bnd(int is,spinor_dble *sd); + +#endif + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sse.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sse.h new file mode 100644 index 0000000000000000000000000000000000000000..20ed66b5b8c27aa940ccb5c9e75c26dccf3f3499 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sse.h @@ -0,0 +1,1356 @@ + +/******************************************************************************* +* +* File sse.h +* +* Copyright (C) 2005, 2008, 2009, 2011 Martin Luescher, Filippo Palombi +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Macros for Dirac spinors, SU(3) vectors and SU(3) matrices using inline +* assembly SSE3 instructions. The machine is assumed to comply with the +* x86-64 instruction set. +* +*******************************************************************************/ + +#ifndef SSE_H +#define SSE_H + +typedef struct +{ + int c1,c2,c3,c4; +} sse_int __attribute__ ((aligned (16))); + +typedef struct +{ + float c1,c2,c3,c4; +} sse_float __attribute__ ((aligned (16))); + +typedef struct +{ + sse_float c1,c2,c3; +} sse_vector __attribute__ ((aligned (16))); + +static sse_float _sse_sgn12 __attribute__ ((unused)) ={-1.0f,-1.0f,1.0f,1.0f}; +static sse_float _sse_sgn13 __attribute__ ((unused)) ={-1.0f,1.0f,-1.0f,1.0f}; +static sse_float _sse_sgn14 __attribute__ ((unused)) ={-1.0f,1.0f,1.0f,-1.0f}; +static sse_float _sse_sgn23 __attribute__ ((unused)) ={1.0f,-1.0f,-1.0f,1.0f}; +static sse_float _sse_sgn24 __attribute__ ((unused)) ={1.0f,-1.0f,1.0f,-1.0f}; +static sse_float _sse_sgn34 __attribute__ ((unused)) ={1.0f,1.0f,-1.0f,-1.0f}; +static sse_float _sse_sgn __attribute__ ((unused)) ={-1.0f,-1.0f,-1.0f,-1.0f}; + +/******************************************************************************* +* +* Prefetch macros +* +*******************************************************************************/ + +#if (defined P4) + +#define _pfbase(addr) ((unsigned long)(addr)&(~0x7fL)) + +#define _prefetch_128b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr))))) + +#define _prefetch_256b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr)))), \ + "m" (*((char*)(_pfbase(addr)+0x80L)))) + +#define _prefetch_384b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1 \n\t" \ + "prefetcht0 %2" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr)))), \ + "m" (*((char*)(_pfbase(addr)+0x80L))), \ + "m" (*((char*)(_pfbase(addr)+0x100L)))) + +#define _prefetch_su3_alg_dble(addr) \ +_prefetch_128b((addr)) + +#define _prefetch_weyl(addr) \ +_prefetch_256b((addr)) + +#define _prefetch_spinor(addr) \ +_prefetch_256b((addr)) + +#define _prefetch_su3(addr) \ +_prefetch_256b((addr)) + +#define _prefetch_pauli(addr) \ +_prefetch_256b((addr)) + +#define _prefetch_weyl_dble(addr) \ +_prefetch_256b((addr)) + +#define _prefetch_spinor_dble(addr) \ +_prefetch_256b((addr)) + +#define _prefetch_su3_dble(addr) \ +_prefetch_256b((addr)) + +#define _prefetch_pauli_dble(addr) \ +_prefetch_384b((addr)) + +#elif (defined PM) + +#define _pfbase(addr) ((unsigned long)(addr)&(~0x3fL)) + +#define _prefetch_64b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr))))) + +#define _prefetch_128b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr)))), \ + "m" (*((char*)(_pfbase(addr)+0x40L)))) + +#define _prefetch_192b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1 \n\t" \ + "prefetcht0 %2" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr)))), \ + "m" (*((char*)(_pfbase(addr)+0x40L))), \ + "m" (*((char*)(_pfbase(addr)+0x80L)))) + +#define _prefetch_320b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1 \n\t" \ + "prefetcht0 %2 \n\t" \ + "prefetcht0 %3 \n\t" \ + "prefetcht0 %4" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr)))), \ + "m" (*((char*)(_pfbase(addr)+0x40L))), \ + "m" (*((char*)(_pfbase(addr)+0x80L))), \ + "m" (*((char*)(_pfbase(addr)+0xc0L))), \ + "m" (*((char*)(_pfbase(addr)+0x100L)))) + +#define _prefetch_su3_alg_dble(addr) \ +_prefetch_64b((addr)) + +#define _prefetch_weyl(addr) \ +_prefetch_64b((addr)) + +#define _prefetch_spinor(addr) \ +_prefetch_128b((addr)) + +#define _prefetch_su3(addr) \ +_prefetch_128b((addr)) + +#define _prefetch_pauli(addr) \ +_prefetch_192b((addr)) + +#define _prefetch_weyl_dble(addr) \ +_prefetch_128b((addr)) + +#define _prefetch_spinor_dble(addr) \ +_prefetch_192b((addr)) + +#define _prefetch_su3_dble(addr) \ +_prefetch_192b((addr)) + +#define _prefetch_pauli_dble(addr) \ +_prefetch_320b((addr)) + +#elif (defined P3) + +#define _pfbase(addr) ((unsigned long)(addr)&(~0x1fL)) + +#define _prefetch_64b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr)))), \ + "m" (*((char*)(_pfbase(addr)+0x20L)))) + +#define _prefetch_96b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1 \n\t" \ + "prefetcht0 %2" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr)))), \ + "m" (*((char*)(_pfbase(addr)+0x20L))), \ + "m" (*((char*)(_pfbase(addr)+0x40L)))) + +#define _prefetch_160b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1 \n\t" \ + "prefetcht0 %2 \n\t" \ + "prefetcht0 %3 \n\t" \ + "prefetcht0 %4" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr)))), \ + "m" (*((char*)(_pfbase(addr)+0x20L))), \ + "m" (*((char*)(_pfbase(addr)+0x40L))), \ + "m" (*((char*)(_pfbase(addr)+0x60L))), \ + "m" (*((char*)(_pfbase(addr)+0x80L)))) + +#define _prefetch_192b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1 \n\t" \ + "prefetcht0 %2 \n\t" \ + "prefetcht0 %3 \n\t" \ + "prefetcht0 %4 \n\t" \ + "prefetcht0 %5" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr)))), \ + "m" (*((char*)(_pfbase(addr)+0x20L))), \ + "m" (*((char*)(_pfbase(addr)+0x40L))), \ + "m" (*((char*)(_pfbase(addr)+0x60L))), \ + "m" (*((char*)(_pfbase(addr)+0x80L))), \ + "m" (*((char*)(_pfbase(addr)+0xa0L)))) + +#define _prefetch_288b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1 \n\t" \ + "prefetcht0 %2 \n\t" \ + "prefetcht0 %3 \n\t" \ + "prefetcht0 %4" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr)))), \ + "m" (*((char*)(_pfbase(addr)+0x20L))), \ + "m" (*((char*)(_pfbase(addr)+0x40L))), \ + "m" (*((char*)(_pfbase(addr)+0x60L))), \ + "m" (*((char*)(_pfbase(addr)+0x80L)))); \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1 \n\t" \ + "prefetcht0 %2 \n\t" \ + "prefetcht0 %3" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr)+0xa0L))), \ + "m" (*((char*)(_pfbase(addr)+0xc0L))), \ + "m" (*((char*)(_pfbase(addr)+0xe0L))), \ + "m" (*((char*)(_pfbase(addr)+0x100L)))) + +#define _prefetch_su3_alg_dble(addr) \ +_prefetch_64b((addr)) + +#define _prefetch_weyl(addr) \ +_prefetch_64b((addr)) + +#define _prefetch_spinor(addr) \ +_prefetch_96b((addr)) + +#define _prefetch_su3(addr) \ +_prefetch_96b((addr)) + +#define _prefetch_pauli(addr) \ +_prefetch_160b((addr)) + +#define _prefetch_weyl_dble(addr) \ +_prefetch_96b((addr)) + +#define _prefetch_spinor_dble(addr) \ +_prefetch_192b((addr)) + +#define _prefetch_su3_dble(addr) \ +_prefetch_160b((addr)) + +#define _prefetch_pauli_dble(addr) \ +_prefetch_288b((addr)) + +#else + +#define _prefetch_su3_alg_dble(addr) + +#define _prefetch_weyl(addr) + +#define _prefetch_spinor(addr) + +#define _prefetch_su3(addr) + +#define _prefetch_pauli(addr) + +#define _prefetch_weyl_dble(addr) + +#define _prefetch_spinor_dble(addr) + +#define _prefetch_su3_dble(addr) + +#define _prefetch_pauli_dble(addr) + +#endif + +/******************************************************************************* +* +* Macros for su3_vector data +* +* Most of these macros operate on pairs of su3 vectors that are stored +* in the low and high words of xmm0,xmm1,xmm2 or xmm3,xmm4,xmm5. For example, +* +* xmm0 -> sl.c1.re,sl.c1.im,sh.c1.re,sh.c1.im +* xmm1 -> sl.c2.re,sl.c2.im,sh.c2.re,sh.c2.im +* xmm2 -> sl.c3.re,sl.c3.im,sh.c3.re,sh.c3.im +* +* (where sl and sh are of type su3_vector). This can also be interpreted as +* an sse_vector s that is stored in these registers according to +* +* xmm0 -> s.c1.c1,s.c1.c2,s.c1.c3,s.c1.c4 +* xmm1 -> s.c2.c1,s.c2.c2,s.c2.c3,s.c2.c4 +* xmm2 -> s.c3.c1,s.c3.c2,s.c3.c3,s.c3.c4 +* +* The load and store macros can be used to move data in either format +* from and to the xmm registers +* +*******************************************************************************/ + +/* +* Loads two su3 vectors sl and sh to the low and high words of xmm0,xmm1,xmm2 +*/ + +#define _sse_pair_load(sl,sh) \ +__asm__ __volatile__ ("movsd %0, %%xmm0 \n\t" \ + "movsd %1, %%xmm1 \n\t" \ + "movsd %2, %%xmm2 \n\t" \ + "movhps %3, %%xmm0 \n\t" \ + "movhps %4, %%xmm1 \n\t" \ + "movhps %5, %%xmm2" \ + : \ + : \ + "m" ((sl).c1), \ + "m" ((sl).c2), \ + "m" ((sl).c3), \ + "m" ((sh).c1), \ + "m" ((sh).c2), \ + "m" ((sh).c3) \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Loads two su3 vectors sl and sh to the low and high words of xmm3,xmm4,xmm5 +*/ + +#define _sse_pair_load_up(sl,sh) \ +__asm__ __volatile__ ("movsd %0, %%xmm3 \n\t" \ + "movsd %1, %%xmm4 \n\t" \ + "movsd %2, %%xmm5 \n\t" \ + "movhps %3, %%xmm3 \n\t" \ + "movhps %4, %%xmm4 \n\t" \ + "movhps %5, %%xmm5" \ + : \ + : \ + "m" ((sl).c1), \ + "m" ((sl).c2), \ + "m" ((sl).c3), \ + "m" ((sh).c1), \ + "m" ((sh).c2), \ + "m" ((sh).c3) \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Stores the low and high words of xmm0,xmm1,xmm2 to the su3 vectors rl and rh +*/ + +#define _sse_pair_store(rl,rh) \ +__asm__ __volatile__ ("movlps %%xmm0, %0 \n\t" \ + "movlps %%xmm1, %1 \n\t" \ + "movlps %%xmm2, %2 \n\t" \ + "movhps %%xmm0, %3 \n\t" \ + "movhps %%xmm1, %4 \n\t" \ + "movhps %%xmm2, %5" \ + : \ + "=m" ((rl).c1), \ + "=m" ((rl).c2), \ + "=m" ((rl).c3), \ + "=m" ((rh).c1), \ + "=m" ((rh).c2), \ + "=m" ((rh).c3)) + +/* +* Stores the low and high words of xmm3,xmm4,xmm5 to the su3 vectors rl and rh +*/ + +#define _sse_pair_store_up(rl,rh) \ +__asm__ __volatile__ ("movlps %%xmm3, %0 \n\t" \ + "movlps %%xmm4, %1 \n\t" \ + "movlps %%xmm5, %2 \n\t" \ + "movhps %%xmm3, %3 \n\t" \ + "movhps %%xmm4, %4 \n\t" \ + "movhps %%xmm5, %5" \ + : \ + "=m" ((rl).c1), \ + "=m" ((rl).c2), \ + "=m" ((rl).c3), \ + "=m" ((rh).c1), \ + "=m" ((rh).c2), \ + "=m" ((rh).c3)) + +/* +* Loads the components of a Weyl spinor s to xmm0,xmm1,xmm2 +*/ + +#define _sse_weyl_load(s) \ +__asm__ __volatile__ ("movaps %0, %%xmm0 \n\t" \ + "movaps %2, %%xmm1 \n\t" \ + "movaps %4, %%xmm2" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Loads the components of a Weyl spinor s to xmm3,xmm4,xmm5 +*/ + +#define _sse_weyl_load_up(s) \ +__asm__ __volatile__ ("movaps %0, %%xmm3 \n\t" \ + "movaps %2, %%xmm4 \n\t" \ + "movaps %4, %%xmm5" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Stores xmm0,xmm1,xmm2 to the components of a Weyl spinor s +*/ + +#define _sse_weyl_store(s) \ +__asm__ __volatile__ ("movaps %%xmm0, %0 \n\t" \ + "movaps %%xmm1, %2 \n\t" \ + "movaps %%xmm2, %4" \ + : \ + "=m" ((s).c1.c1), \ + "=m" ((s).c1.c2), \ + "=m" ((s).c1.c3), \ + "=m" ((s).c2.c1), \ + "=m" ((s).c2.c2), \ + "=m" ((s).c2.c3)) + +/* +* Stores xmm3,xmm4,xmm5 to the components of a Weyl spinor s +*/ + +#define _sse_weyl_store_up(s) \ +__asm__ __volatile__ ("movaps %%xmm3, %0 \n\t" \ + "movaps %%xmm4, %2 \n\t" \ + "movaps %%xmm5, %4" \ + : \ + "=m" ((s).c1.c1), \ + "=m" ((s).c1.c2), \ + "=m" ((s).c1.c3), \ + "=m" ((s).c2.c1), \ + "=m" ((s).c2.c2), \ + "=m" ((s).c2.c3)) + +/* +* Adds xmm3,xmm4,xmm5 to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_add() \ +__asm__ __volatile__ ("addps %%xmm3, %%xmm0 \n\t" \ + "addps %%xmm4, %%xmm1 \n\t" \ + "addps %%xmm5, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Subtracts xmm3,xmm4,xmm5 from xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_sub() \ +__asm__ __volatile__ ("subps %%xmm3, %%xmm0 \n\t" \ + "subps %%xmm4, %%xmm1 \n\t" \ + "subps %%xmm5, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Multiplies the high words xmm3,xmm4,xmm5 with -1 and adds these registers +* to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_addsub() \ +__asm__ __volatile__ ("mulps %0, %%xmm3 \n\t" \ + "mulps %0, %%xmm4 \n\t" \ + "mulps %0, %%xmm5 \n\t" \ + "addps %%xmm3, %%xmm0 \n\t" \ + "addps %%xmm4, %%xmm1 \n\t" \ + "addps %%xmm5, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn34) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies the low words xmm3,xmm4,xmm5 with -1 and adds these registers +* to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_subadd() \ +__asm__ __volatile__ ("mulps %0, %%xmm3 \n\t" \ + "mulps %0, %%xmm4 \n\t" \ + "mulps %0, %%xmm5 \n\t" \ + "addps %%xmm3, %%xmm0 \n\t" \ + "addps %%xmm4, %%xmm1 \n\t" \ + "addps %%xmm5, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn12) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies xmm3,xmm4,xmm5 with i and adds them to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_i_add() \ +__asm__ __volatile__ ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \ + "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \ + "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \ + "addsubps %%xmm3, %%xmm0 \n\t" \ + "addsubps %%xmm4, %%xmm1 \n\t" \ + "addsubps %%xmm5, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies xmm3,xmm4,xmm5 with i and subtracts them from xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_i_sub() \ +__asm__ __volatile__ ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \ + "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \ + "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \ + "mulps %0, %%xmm3 \n\t" \ + "mulps %0, %%xmm4 \n\t" \ + "mulps %0, %%xmm5 \n\t" \ + "addps %%xmm3, %%xmm0 \n\t" \ + "addps %%xmm4, %%xmm1 \n\t" \ + "addps %%xmm5, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn24) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Exchanges the high and low words of xmm3,xmm4,xmm5, multiplies them with i +* and adds the result to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_xch_i_add() \ +__asm__ __volatile__ ("shufps $0x1b, %%xmm3, %%xmm3 \n\t" \ + "shufps $0x1b, %%xmm4, %%xmm4 \n\t" \ + "shufps $0x1b, %%xmm5, %%xmm5 \n\t" \ + "addsubps %%xmm3, %%xmm0 \n\t" \ + "addsubps %%xmm4, %%xmm1 \n\t" \ + "addsubps %%xmm5, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Exchanges the high and low words of xmm3,xmm4,xmm5, multiplies them with i +* and subtracts the result from xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_xch_i_sub() \ +__asm__ __volatile__ ("shufps $0x1b, %%xmm3, %%xmm3 \n\t" \ + "shufps $0x1b, %%xmm4, %%xmm4 \n\t" \ + "shufps $0x1b, %%xmm5, %%xmm5 \n\t" \ + "mulps %0, %%xmm3 \n\t" \ + "mulps %0, %%xmm4 \n\t" \ + "mulps %0, %%xmm5 \n\t" \ + "addps %%xmm3, %%xmm0 \n\t" \ + "addps %%xmm4, %%xmm1 \n\t" \ + "addps %%xmm5, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn24) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies the low and high words of xmm3,xmm4,xmm5 with i and -i +* respectively and adds these registers to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_i_addsub() \ +__asm__ __volatile__ ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \ + "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \ + "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \ + "mulps %0, %%xmm3 \n\t" \ + "mulps %0, %%xmm4 \n\t" \ + "mulps %0, %%xmm5 \n\t" \ + "addps %%xmm3, %%xmm0 \n\t" \ + "addps %%xmm4, %%xmm1 \n\t" \ + "addps %%xmm5, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn14) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies the low and high words of xmm3,xmm4,xmm5 with -i and i +* respectively and adds these registers to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_i_subadd() \ +__asm__ __volatile__ ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \ + "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \ + "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \ + "mulps %0, %%xmm3 \n\t" \ + "mulps %0, %%xmm4 \n\t" \ + "mulps %0, %%xmm5 \n\t" \ + "addps %%xmm3, %%xmm0 \n\t" \ + "addps %%xmm4, %%xmm1 \n\t" \ + "addps %%xmm5, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn23) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Exchanges the high and low words in xmm3,xmm4,xmm5 +*/ + +#define _sse_vector_xch() \ +__asm__ __volatile__ ("shufps $0x4e, %%xmm3, %%xmm3 \n\t" \ + "shufps $0x4e, %%xmm4, %%xmm4 \n\t" \ + "shufps $0x4e, %%xmm5, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + +/****************************************************************************** +* +* Action of su3 matrices on su3 vectors +* +******************************************************************************/ + +/* +* Multiplies a pair sl,sh of su3 vectors with an su3 matrix u, +* assuming sl and sh are in the low and high words of xmm0,xmm1,xmm2 +* +* On output the result is in xmm3,xmm4,xmm5 and the registers +* xmm0,xmm1,xmm2 are changed +*/ + +#define _sse_su3_multiply(u) \ +__asm__ __volatile__ ("movss %0, %%xmm3 \n\t" \ + "movss %1, %%xmm6 \n\t" \ + "movss %2, %%xmm4 \n\t" \ + "movss %3, %%xmm7 \n\t" \ + "movss %4, %%xmm5 \n\t" \ + "movss %5, %%xmm8 \n\t" \ + "shufps $0x0, %%xmm3, %%xmm3 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm4, %%xmm4 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm5, %%xmm5 \n\t" \ + "shufps $0x0, %%xmm8, %%xmm8" \ + : \ + : \ + "m" ((u).c11.re), \ + "m" ((u).c12.re), \ + "m" ((u).c21.re), \ + "m" ((u).c22.re), \ + "m" ((u).c31.re), \ + "m" ((u).c32.re) \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("mulps %%xmm0, %%xmm3 \n\t" \ + "mulps %%xmm1, %%xmm6 \n\t" \ + "mulps %%xmm0, %%xmm4 \n\t" \ + "mulps %%xmm1, %%xmm7 \n\t" \ + "mulps %%xmm0, %%xmm5 \n\t" \ + "mulps %%xmm1, %%xmm8 \n\t" \ + "addps %%xmm6, %%xmm3 \n\t" \ + "addps %%xmm7, %%xmm4 \n\t" \ + "addps %%xmm8, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("movss %0, %%xmm9 \n\t" \ + "movss %1, %%xmm10 \n\t" \ + "movss %2, %%xmm11 \n\t" \ + "movss %3, %%xmm6 \n\t" \ + "movss %4, %%xmm7 \n\t" \ + "movss %5, %%xmm8 \n\t" \ + "shufps $0xb1, %%xmm0, %%xmm0 \n\t" \ + "shufps $0x0, %%xmm9, %%xmm9 \n\t" \ + "shufps $0x0, %%xmm10, %%xmm10 \n\t" \ + "shufps $0x0, %%xmm11, %%xmm11 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm8, %%xmm8" \ + : \ + : \ + "m" ((u).c13.re), \ + "m" ((u).c21.im), \ + "m" ((u).c33.re), \ + "m" ((u).c11.im), \ + "m" ((u).c23.re), \ + "m" ((u).c31.im) \ + : \ + "xmm0", "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("mulps %%xmm2, %%xmm9 \n\t" \ + "mulps %%xmm0, %%xmm10 \n\t" \ + "mulps %%xmm2, %%xmm11 \n\t" \ + "mulps %%xmm0, %%xmm6 \n\t" \ + "mulps %%xmm2, %%xmm7 \n\t" \ + "mulps %%xmm0, %%xmm8 \n\t" \ + "addps %%xmm9, %%xmm3 \n\t" \ + "addsubps %%xmm10, %%xmm4 \n\t" \ + "addps %%xmm11, %%xmm5 \n\t" \ + "addsubps %%xmm6, %%xmm3 \n\t" \ + "addps %%xmm7, %%xmm4 \n\t" \ + "addsubps %%xmm8, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("movss %0, %%xmm9 \n\t" \ + "movss %1, %%xmm10 \n\t" \ + "movss %2, %%xmm11 \n\t" \ + "movss %3, %%xmm6 \n\t" \ + "movss %4, %%xmm7 \n\t" \ + "movss %5, %%xmm8 \n\t" \ + "shufps $0xb1, %%xmm1, %%xmm1 \n\t" \ + "shufps $0xb1, %%xmm2, %%xmm2 \n\t" \ + "shufps $0x0, %%xmm9, %%xmm9 \n\t" \ + "shufps $0x0, %%xmm10, %%xmm10 \n\t" \ + "shufps $0x0, %%xmm11, %%xmm11 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm8, %%xmm8" \ + : \ + : \ + "m" ((u).c12.im), \ + "m" ((u).c23.im), \ + "m" ((u).c32.im), \ + "m" ((u).c13.im), \ + "m" ((u).c22.im), \ + "m" ((u).c33.im) \ + : \ + "xmm1", "xmm2", "xmm6", "xmm7", \ + "xmm8", "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("mulps %%xmm1, %%xmm9 \n\t" \ + "mulps %%xmm2, %%xmm10 \n\t" \ + "mulps %%xmm1, %%xmm11 \n\t" \ + "mulps %%xmm2, %%xmm6 \n\t" \ + "mulps %%xmm1, %%xmm7 \n\t" \ + "mulps %%xmm2, %%xmm8 \n\t" \ + "addsubps %%xmm9, %%xmm3 \n\t" \ + "addsubps %%xmm10, %%xmm4 \n\t" \ + "addsubps %%xmm11, %%xmm5 \n\t" \ + "addsubps %%xmm6, %%xmm3 \n\t" \ + "addsubps %%xmm7, %%xmm4 \n\t" \ + "addsubps %%xmm8, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11") + +/* +* Multiplies a pair sl,sh of su3 vectors with an su3 matrix u^dagger, +* assuming sl and sh are in the low and high words of xmm0,xmm1,xmm2 +* +* On output the result is in xmm3,xmm4,xmm5 and the registers +* xmm0,xmm1,xmm2 are changed +*/ + +#define _sse_su3_inverse_multiply(u) \ +__asm__ __volatile__ ("movss %0, %%xmm6 \n\t" \ + "movss %1, %%xmm9 \n\t" \ + "movss %2, %%xmm7 \n\t" \ + "movss %3, %%xmm10 \n\t" \ + "movss %4, %%xmm8 \n\t" \ + "movss %5, %%xmm11 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm9, %%xmm9 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm10, %%xmm10 \n\t" \ + "shufps $0x0, %%xmm8, %%xmm8 \n\t" \ + "shufps $0x0, %%xmm11, %%xmm11" \ + : \ + : \ + "m" ((u).c11.im), \ + "m" ((u).c21.im), \ + "m" ((u).c12.im), \ + "m" ((u).c22.im), \ + "m" ((u).c13.im), \ + "m" ((u).c23.im) \ + : \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("mulps %%xmm0, %%xmm6 \n\t" \ + "mulps %%xmm1, %%xmm9 \n\t" \ + "mulps %%xmm0, %%xmm7 \n\t" \ + "mulps %%xmm1, %%xmm10 \n\t" \ + "mulps %%xmm0, %%xmm8 \n\t" \ + "mulps %%xmm1, %%xmm11 \n\t" \ + "addps %%xmm6, %%xmm9 \n\t" \ + "addps %%xmm7, %%xmm10 \n\t" \ + "addps %%xmm8, %%xmm11" \ + : \ + : \ + : \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("movss %0, %%xmm3 \n\t" \ + "movss %1, %%xmm4 \n\t" \ + "movss %2, %%xmm5 \n\t" \ + "movss %3, %%xmm6 \n\t" \ + "movss %4, %%xmm7 \n\t" \ + "movss %5, %%xmm8 \n\t" \ + "shufps $0xb1, %%xmm0, %%xmm0 \n\t" \ + "shufps $0x0, %%xmm3, %%xmm3 \n\t" \ + "shufps $0x0, %%xmm4, %%xmm4 \n\t" \ + "shufps $0x0, %%xmm5, %%xmm5 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm8, %%xmm8" \ + : \ + : \ + "m" ((u).c11.re), \ + "m" ((u).c12.re), \ + "m" ((u).c13.re), \ + "m" ((u).c31.im), \ + "m" ((u).c32.im), \ + "m" ((u).c33.im) \ + : \ + "xmm0", "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("mulps %%xmm0, %%xmm3 \n\t" \ + "mulps %%xmm0, %%xmm4 \n\t" \ + "mulps %%xmm0, %%xmm5 \n\t" \ + "mulps %%xmm2, %%xmm6 \n\t" \ + "mulps %%xmm2, %%xmm7 \n\t" \ + "mulps %%xmm2, %%xmm8 \n\t" \ + "addsubps %%xmm9, %%xmm3 \n\t" \ + "addsubps %%xmm10, %%xmm4 \n\t" \ + "addsubps %%xmm11, %%xmm5 \n\t" \ + "addsubps %%xmm6, %%xmm3 \n\t" \ + "addsubps %%xmm7, %%xmm4 \n\t" \ + "addsubps %%xmm8, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("movss %0, %%xmm9 \n\t" \ + "movss %1, %%xmm10 \n\t" \ + "movss %2, %%xmm11 \n\t" \ + "movss %3, %%xmm6 \n\t" \ + "movss %4, %%xmm7 \n\t" \ + "movss %5, %%xmm8 \n\t" \ + "shufps $0xb1, %%xmm1, %%xmm1 \n\t" \ + "shufps $0xb1, %%xmm2, %%xmm2 \n\t" \ + "shufps $0x0, %%xmm9, %%xmm9 \n\t" \ + "shufps $0x0, %%xmm10, %%xmm10 \n\t" \ + "shufps $0x0, %%xmm11, %%xmm11 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm8, %%xmm8" \ + : \ + : \ + "m" ((u).c21.re), \ + "m" ((u).c32.re), \ + "m" ((u).c23.re), \ + "m" ((u).c31.re), \ + "m" ((u).c22.re), \ + "m" ((u).c33.re) \ + : \ + "xmm1", "xmm2", "xmm6", "xmm7", \ + "xmm8", "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("mulps %%xmm1, %%xmm9 \n\t" \ + "mulps %%xmm2, %%xmm10 \n\t" \ + "mulps %%xmm1, %%xmm11 \n\t" \ + "mulps %%xmm2, %%xmm6 \n\t" \ + "mulps %%xmm1, %%xmm7 \n\t" \ + "mulps %%xmm2, %%xmm8 \n\t" \ + "addps %%xmm9, %%xmm3 \n\t" \ + "addps %%xmm10, %%xmm4 \n\t" \ + "addps %%xmm11, %%xmm5 \n\t" \ + "addps %%xmm6, %%xmm3 \n\t" \ + "addps %%xmm7, %%xmm4 \n\t" \ + "addps %%xmm8, %%xmm5 \n\t" \ + "shufps $0xb1, %%xmm3, %%xmm3 \n\t" \ + "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \ + "shufps $0xb1, %%xmm5, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11") + +/****************************************************************************** +* +* Macros for Dirac spinors +* +******************************************************************************/ + +/* +* Loads the spinor s to the registers xmm0,..,xmm5 in linear order +*/ + +#define _sse_spinor_load(s) \ +__asm__ __volatile__ ("movaps %0, %%xmm0 \n\t" \ + "movaps %2, %%xmm1 \n\t" \ + "movaps %4, %%xmm2" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm0", "xmm1", "xmm2"); \ +__asm__ __volatile__ ("movaps %0, %%xmm3 \n\t" \ + "movaps %2, %%xmm4 \n\t" \ + "movaps %4, %%xmm5" \ + : \ + : \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2), \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Loads the spinor s to the registers xmm6,..,xmm11 in linear order +*/ + +#define _sse_spinor_load_up(s) \ +__asm__ __volatile__ ("movaps %0, %%xmm6 \n\t" \ + "movaps %2, %%xmm7 \n\t" \ + "movaps %4, %%xmm8" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("movaps %0, %%xmm9 \n\t" \ + "movaps %2, %%xmm10 \n\t" \ + "movaps %4, %%xmm11" \ + : \ + : \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2), \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm9", "xmm10", "xmm11") + +/* +* Stores the registers xmm0,..,xmm5 to the spinor s in linear order +*/ + +#define _sse_spinor_store(s) \ +__asm__ __volatile__ ("movaps %%xmm0, %0 \n\t" \ + "movaps %%xmm1, %2 \n\t" \ + "movaps %%xmm2, %4" \ + : \ + "=m" ((s).c1.c1), \ + "=m" ((s).c1.c2), \ + "=m" ((s).c1.c3), \ + "=m" ((s).c2.c1), \ + "=m" ((s).c2.c2), \ + "=m" ((s).c2.c3)); \ +__asm__ __volatile__ ("movaps %%xmm3, %0 \n\t" \ + "movaps %%xmm4, %2 \n\t" \ + "movaps %%xmm5, %4" \ + : \ + "=m" ((s).c3.c1), \ + "=m" ((s).c3.c2), \ + "=m" ((s).c3.c3), \ + "=m" ((s).c4.c1), \ + "=m" ((s).c4.c2), \ + "=m" ((s).c4.c3)) + +/* +* Stores the registers xmm6,..,xmm11 to the spinor s in linear order +*/ + +#define _sse_spinor_store_up(s) \ +__asm__ __volatile__ ("movaps %%xmm6, %0 \n\t" \ + "movaps %%xmm7, %2 \n\t" \ + "movaps %%xmm8, %4" \ + : \ + "=m" ((s).c1.c1), \ + "=m" ((s).c1.c2), \ + "=m" ((s).c1.c3), \ + "=m" ((s).c2.c1), \ + "=m" ((s).c2.c2), \ + "=m" ((s).c2.c3)); \ +__asm__ __volatile__ ("movaps %%xmm9, %0 \n\t" \ + "movaps %%xmm10, %2 \n\t" \ + "movaps %%xmm11, %4" \ + : \ + "=m" ((s).c3.c1), \ + "=m" ((s).c3.c2), \ + "=m" ((s).c3.c3), \ + "=m" ((s).c4.c1), \ + "=m" ((s).c4.c2), \ + "=m" ((s).c4.c3)) + +/* +* Loads (z.re,z.re,z.re,z.re) to xmm6 and (-z.im,z.im,-z.im,z.im) to xmm7 +*/ + +#define _sse_load_cmplx(z) \ +__asm__ __volatile__ ("movss %0, %%xmm6 \n\t" \ + "movss %1, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "mulps %2, %%xmm7" \ + : \ + : \ + "m" ((z).re), \ + "m" ((z).im), \ + "m" (_sse_sgn13) \ + : \ + "xmm6", "xmm7") + +/* +* Multiplies the spinor s by the complex number z and assigns the result to +* xmm0,..,xmm5, assuming z was loaded to xmm6,xmm7 using _sse_load_cmplx(z) +*/ + +#define _sse_mulc_spinor(s) \ +__asm__ __volatile__ ("movaps %0, %%xmm0 \n\t" \ + "movaps %2, %%xmm1 \n\t" \ + "movaps %4, %%xmm2" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm0", "xmm1", "xmm2"); \ +__asm__ __volatile__ ("movaps %%xmm0, %%xmm8 \n\t" \ + "movaps %%xmm1, %%xmm9 \n\t" \ + "movaps %%xmm2, %%xmm10 \n\t" \ + "mulps %%xmm6, %%xmm0 \n\t" \ + "mulps %%xmm6, %%xmm1 \n\t" \ + "mulps %%xmm6, %%xmm2 \n\t" \ + "shufps $0xb1, %%xmm8, %%xmm8 \n\t" \ + "shufps $0xb1, %%xmm9, %%xmm9 \n\t" \ + "shufps $0xb1, %%xmm10, %%xmm10 \n\t" \ + "mulps %%xmm7, %%xmm8 \n\t" \ + "mulps %%xmm7, %%xmm9 \n\t" \ + "mulps %%xmm7, %%xmm10 \n\t" \ + "addps %%xmm8, %%xmm0 \n\t" \ + "addps %%xmm9, %%xmm1 \n\t" \ + "addps %%xmm10, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm8", "xmm9", "xmm10"); \ +__asm__ __volatile__ ("movaps %0, %%xmm3 \n\t" \ + "movaps %2, %%xmm4 \n\t" \ + "movaps %4, %%xmm5" \ + : \ + : \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2), \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm3", "xmm4", "xmm5"); \ +__asm__ __volatile__ ("movaps %%xmm3, %%xmm11 \n\t" \ + "movaps %%xmm4, %%xmm12 \n\t" \ + "movaps %%xmm5, %%xmm13 \n\t" \ + "mulps %%xmm6, %%xmm3 \n\t" \ + "mulps %%xmm6, %%xmm4 \n\t" \ + "mulps %%xmm6, %%xmm5 \n\t" \ + "shufps $0xb1, %%xmm11, %%xmm11 \n\t" \ + "shufps $0xb1, %%xmm12, %%xmm12 \n\t" \ + "shufps $0xb1, %%xmm13, %%xmm13 \n\t" \ + "mulps %%xmm7, %%xmm11 \n\t" \ + "mulps %%xmm7, %%xmm12 \n\t" \ + "mulps %%xmm7, %%xmm13 \n\t" \ + "addps %%xmm11, %%xmm3 \n\t" \ + "addps %%xmm12, %%xmm4 \n\t" \ + "addps %%xmm13, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm11", "xmm12", "xmm13") + + +/* +* Multiplies the spinor s by the complex number z and adds the result to +* xmm0,..,xmm5, assuming z was loaded to xmm6,xmm7 using _sse_load_cmplx(z) +*/ + +#define _sse_mulc_spinor_add(s) \ +__asm__ __volatile__ ("movaps %0, %%xmm8 \n\t" \ + "movaps %2, %%xmm9 \n\t" \ + "movaps %4, %%xmm10" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm8", "xmm9", "xmm10"); \ +__asm__ __volatile__ ("movaps %%xmm8, %%xmm11 \n\t" \ + "movaps %%xmm9, %%xmm12 \n\t" \ + "movaps %%xmm10, %%xmm13 \n\t" \ + "mulps %%xmm6, %%xmm8 \n\t" \ + "mulps %%xmm6, %%xmm9 \n\t" \ + "mulps %%xmm6, %%xmm10 \n\t" \ + "shufps $0xb1, %%xmm11, %%xmm11 \n\t" \ + "shufps $0xb1, %%xmm12, %%xmm12 \n\t" \ + "shufps $0xb1, %%xmm13, %%xmm13 \n\t" \ + "addps %%xmm8, %%xmm0 \n\t" \ + "addps %%xmm9, %%xmm1 \n\t" \ + "addps %%xmm10, %%xmm2 \n\t" \ + "mulps %%xmm7, %%xmm11 \n\t" \ + "mulps %%xmm7, %%xmm12 \n\t" \ + "mulps %%xmm7, %%xmm13 \n\t" \ + "addps %%xmm11, %%xmm0 \n\t" \ + "addps %%xmm12, %%xmm1 \n\t" \ + "addps %%xmm13, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm8", "xmm9", "xmm10", \ + "xmm11", "xmm12", "xmm13"); \ +__asm__ __volatile__ ("movaps %0, %%xmm8 \n\t" \ + "movaps %2, %%xmm9 \n\t" \ + "movaps %4, %%xmm10" \ + : \ + : \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2), \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm8", "xmm9", "xmm10"); \ +__asm__ __volatile__ ("movaps %%xmm8, %%xmm11 \n\t" \ + "movaps %%xmm9, %%xmm12 \n\t" \ + "movaps %%xmm10, %%xmm13 \n\t" \ + "mulps %%xmm6, %%xmm8 \n\t" \ + "mulps %%xmm6, %%xmm9 \n\t" \ + "mulps %%xmm6, %%xmm10 \n\t" \ + "shufps $0xb1, %%xmm11, %%xmm11 \n\t" \ + "shufps $0xb1, %%xmm12, %%xmm12 \n\t" \ + "shufps $0xb1, %%xmm13, %%xmm13 \n\t" \ + "addps %%xmm8, %%xmm3 \n\t" \ + "addps %%xmm9, %%xmm4 \n\t" \ + "addps %%xmm10, %%xmm5 \n\t" \ + "mulps %%xmm7, %%xmm11 \n\t" \ + "mulps %%xmm7, %%xmm12 \n\t" \ + "mulps %%xmm7, %%xmm13 \n\t" \ + "addps %%xmm11, %%xmm3 \n\t" \ + "addps %%xmm12, %%xmm4 \n\t" \ + "addps %%xmm13, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm8", "xmm9", "xmm10", \ + "xmm11", "xmm12", "xmm13") + +/* +* Loads (c,c,c,c) to xmm6 and xmm7 +*/ + +#define _sse_load_real(c) \ +__asm__ __volatile__ ("movss %0, %%xmm6 \n\t" \ + "movss %0, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7" \ + : \ + : \ + "m" (c) \ + : \ + "xmm6", "xmm7") + +/* +* Multiplies the spinor s by the real number c and assigns the result to +* xmm0,..,xmm5, assuming c was loaded to xmm6,xmm7 using _sse_load_real(c) +*/ + +#define _sse_mulr_spinor(s) \ +__asm__ __volatile__ ("movaps %0, %%xmm0 \n\t" \ + "movaps %2, %%xmm1 \n\t" \ + "movaps %4, %%xmm2" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm0", "xmm1", "xmm2"); \ +__asm__ __volatile__ ("mulps %%xmm6, %%xmm0 \n\t" \ + "mulps %%xmm7, %%xmm1 \n\t" \ + "mulps %%xmm6, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2"); \ +__asm__ __volatile__ ("movaps %0, %%xmm3 \n\t" \ + "movaps %2, %%xmm4 \n\t" \ + "movaps %4, %%xmm5" \ + : \ + : \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2), \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm3", "xmm4", "xmm5"); \ +__asm__ __volatile__ ("mulps %%xmm7, %%xmm3 \n\t" \ + "mulps %%xmm6, %%xmm4 \n\t" \ + "mulps %%xmm7, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies the spinor s by the real number c and adds the result to +* xmm0,..,xmm5, assuming c was loaded to xmm6,xmm7 using _sse_load_real(c) +*/ + +#define _sse_mulr_spinor_add(s) \ +__asm__ __volatile__ ("movaps %0, %%xmm8 \n\t" \ + "movaps %2, %%xmm9 \n\t" \ + "movaps %4, %%xmm10" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm8", "xmm9", "xmm10"); \ +__asm__ __volatile__ ("mulps %%xmm6, %%xmm8 \n\t" \ + "mulps %%xmm7, %%xmm9 \n\t" \ + "mulps %%xmm6, %%xmm10 \n\t" \ + "addps %%xmm8, %%xmm0 \n\t" \ + "addps %%xmm9, %%xmm1 \n\t" \ + "addps %%xmm10, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm8", "xmm9", "xmm10"); \ +__asm__ __volatile__ ("movaps %0, %%xmm11 \n\t" \ + "movaps %2, %%xmm12 \n\t" \ + "movaps %4, %%xmm13" \ + : \ + : \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2), \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm11", "xmm12", "xmm13"); \ +__asm__ __volatile__ ("mulps %%xmm7, %%xmm11 \n\t" \ + "mulps %%xmm6, %%xmm12 \n\t" \ + "mulps %%xmm7, %%xmm13 \n\t" \ + "addps %%xmm11, %%xmm3 \n\t" \ + "addps %%xmm12, %%xmm4 \n\t" \ + "addps %%xmm13, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm11", "xmm12", "xmm13") + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sse2.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sse2.h new file mode 100644 index 0000000000000000000000000000000000000000..a0c7fb168ad71ff55102b16e83e0cf8f75e4080e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sse2.h @@ -0,0 +1,659 @@ + +/******************************************************************************* +* +* File sse2.h +* +* Copyright (C) 2005, 2008, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Macros for Dirac spinors, SU(3) vectors and SU(3) matrices using inline +* assembly SSE3 instructions. The machine is assumed to comply with the +* x86-64 instruction set. +* +*******************************************************************************/ + +#ifndef SSE2_H +#define SSE2_H + +#ifndef SSE_H +#include "sse.h" +#endif + +typedef struct +{ + double c1,c2; +} sse_double __attribute__ ((aligned (16))); + +static sse_double _sse_sgn1_dble __attribute__ ((unused)) ={-1.0,1.0}; +static sse_double _sse_sgn2_dble __attribute__ ((unused)) ={1.0,-1.0}; +static sse_double _sse_sgn_dble __attribute__ ((unused)) ={-1.0,-1.0}; + +/******************************************************************************* +* +* Macros for double-precision su3 vectors +* +* Most of these macros operate on su3 vectors that are stored +* in xmm0,xmm1,xmm2 or xmm3,xmm4,xmm5. For example, +* +* xmm0 -> s.c1.re,s.c1.im +* xmm1 -> s.c2.re,s.c2.im +* xmm2 -> s.c3.re,s.c3.im +* +* where s is of type su3_vector_dble +* +*******************************************************************************/ + +/* +* Loads an su3 vector s to xmm0,xmm1,xmm2 +*/ + +#define _sse_load_dble(s) \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((s).c1), \ + "m" ((s).c2), \ + "m" ((s).c3) \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Loads an su3 vector s to xmm3,xmm4,xmm5 +*/ + +#define _sse_load_up_dble(s) \ +__asm__ __volatile__ ("movapd %0, %%xmm3 \n\t" \ + "movapd %1, %%xmm4 \n\t" \ + "movapd %2, %%xmm5" \ + : \ + : \ + "m" ((s).c1), \ + "m" ((s).c2), \ + "m" ((s).c3) \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Stores xmm0,xmm1,xmm2 to the components r.c1,r.c2,r.c3 of an su3 vector +*/ + +#define _sse_store_dble(r) \ +__asm__ __volatile__ ("movapd %%xmm0, %0 \n\t" \ + "movapd %%xmm1, %1 \n\t" \ + "movapd %%xmm2, %2" \ + : \ + "=m" ((r).c1), \ + "=m" ((r).c2), \ + "=m" ((r).c3)) + +/* +* Stores xmm3,xmm4,xmm5 to the components r.c1,r.c2,r.c3 of an su3 vector +*/ + +#define _sse_store_up_dble(r) \ +__asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" \ + "movapd %%xmm4, %1 \n\t" \ + "movapd %%xmm5, %2" \ + : \ + "=m" ((r).c1), \ + "=m" ((r).c2), \ + "=m" ((r).c3)) + +/* +* Multiplies xmm0,xmm1,xmm2 with a constant sse_double c +*/ + +#define _sse_vector_mul_dble(c) \ +__asm__ __volatile__ ("mulpd %0, %%xmm0 \n\t" \ + "mulpd %0, %%xmm1 \n\t" \ + "mulpd %0, %%xmm2" \ + : \ + : \ + "m" (c) \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Multiplies xmm3,xmm4,xmm5 with a constant sse_double c +*/ + +#define _sse_vector_mul_up_dble(c) \ +__asm__ __volatile__ ("mulpd %0, %%xmm3 \n\t" \ + "mulpd %0, %%xmm4 \n\t" \ + "mulpd %0, %%xmm5" \ + : \ + : \ + "m" (c) \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Adds xmm3,xmm4,xmm5 to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_add_dble() \ +__asm__ __volatile__ ("addpd %%xmm3, %%xmm0 \n\t" \ + "addpd %%xmm4, %%xmm1 \n\t" \ + "addpd %%xmm5, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Subtracts xmm3,xmm4,xmm5 from xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_sub_dble() \ +__asm__ __volatile__ ("subpd %%xmm3, %%xmm0 \n\t" \ + "subpd %%xmm4, %%xmm1 \n\t" \ + "subpd %%xmm5, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Multiplies xmm3,xmm4,xmm5 with i +*/ + +#define _sse_vector_i_mul_dble() \ +__asm__ __volatile__ ("shufpd $0x1, %%xmm3, %%xmm3 \n\t" \ + "shufpd $0x1, %%xmm4, %%xmm4 \n\t" \ + "shufpd $0x1, %%xmm5, %%xmm5 \n\t" \ + "mulpd %0, %%xmm3 \n\t" \ + "mulpd %0, %%xmm4 \n\t" \ + "mulpd %0, %%xmm5" \ + : \ + : \ + "m" (_sse_sgn1_dble) \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies xmm3,xmm4,xmm5 with i and adds them to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_i_add_dble() \ +__asm__ __volatile__ ("shufpd $0x1, %%xmm3, %%xmm3 \n\t" \ + "shufpd $0x1, %%xmm4, %%xmm4 \n\t" \ + "shufpd $0x1, %%xmm5, %%xmm5 \n\t" \ + "addsubpd %%xmm3, %%xmm0 \n\t" \ + "addsubpd %%xmm4, %%xmm1 \n\t" \ + "addsubpd %%xmm5, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Loads (z.re,z.re) to xmm6 and (-z.im,z.im) to xmm7 +*/ + +#define _sse_load_cmplx_dble(z) \ +__asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" \ + "movddup %1, %%xmm7 \n\t" \ + "mulpd %2, %%xmm7" \ + : \ + : \ + "m" ((z).re), \ + "m" ((z).im), \ + "m" (_sse_sgn1_dble) \ + : \ + "xmm6", "xmm7") + +/* +* Multiplies the complex numbers in xmm0,xmm1,xmm2 by z, assuming z has +* been loaded to xmm6,xmm7 by _sse_load_cmplx_dble(z). The result appears +* in xmm0,xmm1,xmm2 and xmm3,xmm4,xmm5,xmm6,xmm7 are unchanged +*/ + +#define _sse_mulc_vector_dble() \ +__asm__ __volatile__ ("movapd %%xmm0, %%xmm8 \n\t" \ + "movapd %%xmm1, %%xmm9 \n\t" \ + "movapd %%xmm2, %%xmm10 \n\t" \ + "mulpd %%xmm6, %%xmm0 \n\t" \ + "mulpd %%xmm6, %%xmm1 \n\t" \ + "mulpd %%xmm6, %%xmm2 \n\t" \ + "shufpd $0x1, %%xmm8, %%xmm8 \n\t" \ + "shufpd $0x1, %%xmm9, %%xmm9 \n\t" \ + "shufpd $0x1, %%xmm10, %%xmm10 \n\t" \ + "mulpd %%xmm7, %%xmm8 \n\t" \ + "mulpd %%xmm7, %%xmm9 \n\t" \ + "mulpd %%xmm7, %%xmm10 \n\t" \ + "addpd %%xmm8, %%xmm0 \n\t" \ + "addpd %%xmm9, %%xmm1 \n\t" \ + "addpd %%xmm10, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm8", "xmm9", "xmm10") + +/* +* Multiplies the complex numbers in xmm3,xmm4,xmm5 by z, assuming z has +* been loaded to xmm6,xmm7 by _sse_load_cmplx_dble(z). The result appears +* in xmm3,xmm4,xmm5 and xmm0,xmm1,xmm2,xmm6,xmm7 are unchanged +*/ + +#define _sse_mulc_vector_up_dble() \ +__asm__ __volatile__ ("movapd %%xmm3, %%xmm8 \n\t" \ + "movapd %%xmm4, %%xmm9 \n\t" \ + "movapd %%xmm5, %%xmm10 \n\t" \ + "mulpd %%xmm6, %%xmm3 \n\t" \ + "mulpd %%xmm6, %%xmm4 \n\t" \ + "mulpd %%xmm6, %%xmm5 \n\t" \ + "shufpd $0x1, %%xmm8, %%xmm8 \n\t" \ + "shufpd $0x1, %%xmm9, %%xmm9 \n\t" \ + "shufpd $0x1, %%xmm10, %%xmm10 \n\t" \ + "mulpd %%xmm7, %%xmm8 \n\t" \ + "mulpd %%xmm7, %%xmm9 \n\t" \ + "mulpd %%xmm7, %%xmm10 \n\t" \ + "addpd %%xmm8, %%xmm3 \n\t" \ + "addpd %%xmm9, %%xmm4 \n\t" \ + "addpd %%xmm10, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm8", "xmm9", "xmm10") + +/* +* Computes s+z*r assuming s is stored in xmm0,xmm1,xmm2 and that z +* has been loaded to xmm6,xmm7 by _sse_load_cmplx_dble(z). The result +* appears in xmm0,xmm1,xmm2 and xmm3,xmm4,xmm5,xmm6,xmm7 are unchanged +*/ + +#define _sse_mulc_vector_add_dble(r) \ +__asm__ __volatile__ ("movapd %0, %%xmm8 \n\t" \ + "movapd %1, %%xmm9 \n\t" \ + "movapd %2, %%xmm10 \n\t" \ + "movapd %%xmm8, %%xmm11 \n\t" \ + "movapd %%xmm9, %%xmm12 \n\t" \ + "movapd %%xmm10, %%xmm13" \ + : \ + : \ + "m" ((r).c1), \ + "m" ((r).c2), \ + "m" ((r).c3) \ + : \ + "xmm8", "xmm9", "xmm10", \ + "xmm11", "xmm12", "xmm13"); \ +__asm__ __volatile__ ("mulpd %%xmm6, %%xmm8 \n\t" \ + "mulpd %%xmm6, %%xmm9 \n\t" \ + "mulpd %%xmm6, %%xmm10 \n\t" \ + "shufpd $0x1, %%xmm11, %%xmm11 \n\t" \ + "shufpd $0x1, %%xmm12, %%xmm12 \n\t" \ + "shufpd $0x1, %%xmm13, %%xmm13 \n\t" \ + "addpd %%xmm8, %%xmm0 \n\t" \ + "addpd %%xmm9, %%xmm1 \n\t" \ + "addpd %%xmm10, %%xmm2 \n\t" \ + "mulpd %%xmm7, %%xmm11 \n\t" \ + "mulpd %%xmm7, %%xmm12 \n\t" \ + "mulpd %%xmm7, %%xmm13 \n\t" \ + "addpd %%xmm11, %%xmm0 \n\t" \ + "addpd %%xmm12, %%xmm1 \n\t" \ + "addpd %%xmm13, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm8", "xmm9", "xmm10", \ + "xmm11", "xmm12", "xmm13") + +/* +* Computes s+z*r assuming s is stored in xmm3,xmm4,xmm5 and that z +* has been loaded to xmm6,xmm7 by _sse_load_cmplx_dble(z). The result +* appears in xmm4,xmm5,xmm6 and xmm0,xmm1,xmm2,xmm6,xmm7 are unchanged +*/ + +#define _sse_mulc_vector_add_up_dble(r) \ +__asm__ __volatile__ ("movapd %0, %%xmm8 \n\t" \ + "movapd %1, %%xmm9 \n\t" \ + "movapd %2, %%xmm10 \n\t" \ + "movapd %%xmm8, %%xmm11 \n\t" \ + "movapd %%xmm9, %%xmm12 \n\t" \ + "movapd %%xmm10, %%xmm13" \ + : \ + : \ + "m" ((r).c1), \ + "m" ((r).c2), \ + "m" ((r).c3) \ + : \ + "xmm8", "xmm9", "xmm10", \ + "xmm11", "xmm12", "xmm13"); \ +__asm__ __volatile__ ("mulpd %%xmm6, %%xmm8 \n\t" \ + "mulpd %%xmm6, %%xmm9 \n\t" \ + "mulpd %%xmm6, %%xmm10 \n\t" \ + "shufpd $0x1, %%xmm11, %%xmm11 \n\t" \ + "shufpd $0x1, %%xmm12, %%xmm12 \n\t" \ + "shufpd $0x1, %%xmm13, %%xmm13 \n\t" \ + "addpd %%xmm8, %%xmm3 \n\t" \ + "addpd %%xmm9, %%xmm4 \n\t" \ + "addpd %%xmm10, %%xmm5 \n\t" \ + "mulpd %%xmm7, %%xmm11 \n\t" \ + "mulpd %%xmm7, %%xmm12 \n\t" \ + "mulpd %%xmm7, %%xmm13 \n\t" \ + "addpd %%xmm11, %%xmm3 \n\t" \ + "addpd %%xmm12, %%xmm4 \n\t" \ + "addpd %%xmm13, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm8", "xmm9", "xmm10", \ + "xmm11", "xmm12", "xmm13") + +/* +* Loads (c,c) to xmm6 and xmm7 +*/ + +#define _sse_load_real_dble(c) \ +__asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" \ + "movddup %0, %%xmm7" \ + : \ + : \ + "m" (c) \ + : \ + "xmm6", "xmm7") + +/* +* Multiplies the complex numbers in xmm0,xmm1,xmm2 by c, assuming c has +* been loaded to xmm6,xmm7 by _sse_load_real_dble(c). The result appears +* in xmm0,xmm1,xmm2 all other xmm registers are unchanged +*/ + +#define _sse_mulr_vector_dble() \ +__asm__ __volatile__ ("mulpd %%xmm6, %%xmm0 \n\t" \ + "mulpd %%xmm7, %%xmm1 \n\t" \ + "mulpd %%xmm6, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Multiplies the complex numbers in xmm3,xmm4,xmm5 by c, assuming c has +* been loaded to xmm6,xmm7 by _sse_load_real_dble(z). The result appears +* in xmm3,xmm4,xmm5 all other xmm registers are unchanged +*/ + +#define _sse_mulr_vector_up_dble() \ +__asm__ __volatile__ ("mulpd %%xmm7, %%xmm3 \n\t" \ + "mulpd %%xmm6, %%xmm4 \n\t" \ + "mulpd %%xmm7, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Computes s+c*r assuming r is stored in xmm0,xmm1,xmm2 and that c +* has been loaded to xmm6,xmm7 by _sse_load_real_dble(z). The result +* appears in xmm0,xmm1,xmm2 all other xmm registers are unchanged +*/ + +#define _sse_mulr_vector_add_dble(s) \ +__asm__ __volatile__ ("mulpd %%xmm6, %%xmm0 \n\t" \ + "mulpd %%xmm7, %%xmm1 \n\t" \ + "mulpd %%xmm6, %%xmm2 \n\t" \ + "addpd %0, %%xmm0 \n\t" \ + "addpd %1, %%xmm1 \n\t" \ + "addpd %2, %%xmm2" \ + : \ + : \ + "m" ((s).c1), \ + "m" ((s).c2), \ + "m" ((s).c3) \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Computes s+c*r assuming r is stored in xmm3,xmm4,xmm5 and that c +* has been loaded to xmm6,xmm7 by _sse_load_real_dble(c). The result +* appears in xmm4,xmm5,xmm6 and all other xmm registers are unchanged +*/ + +#define _sse_mulr_vector_add_up_dble(s) \ +__asm__ __volatile__ ("mulpd %%xmm7, %%xmm3 \n\t" \ + "mulpd %%xmm6, %%xmm4 \n\t" \ + "mulpd %%xmm7, %%xmm5 \n\t" \ + "addpd %0, %%xmm3 \n\t" \ + "addpd %1, %%xmm4 \n\t" \ + "addpd %2, %%xmm5" \ + : \ + : \ + "m" ((s).c1), \ + "m" ((s).c2), \ + "m" ((s).c3) \ + : \ + "xmm3", "xmm4", "xmm5") + +/****************************************************************************** +* +* Action of su3 matrices on su3 vectors +* +******************************************************************************/ + +/* +* Multiplies an su3 vector s with an su3 matrix u, assuming s is +* stored in xmm0,xmm1,xmm2 +* +* On output the result is in xmm3,xmm4,xmm5 and the registers +* xmm0,xmm1,xmm2 are changed +*/ + +#define _sse_su3_multiply_dble(u) \ +__asm__ __volatile__ ("movddup %0, %%xmm3 \n\t" \ + "movddup %1, %%xmm6 \n\t" \ + "movddup %2, %%xmm4 \n\t" \ + "movddup %3, %%xmm7 \n\t" \ + "movddup %4, %%xmm5 \n\t" \ + "movddup %5, %%xmm8 \n\t" \ + "mulpd %%xmm0, %%xmm3 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "mulpd %%xmm0, %%xmm4 \n\t" \ + "mulpd %%xmm1, %%xmm7 \n\t" \ + "mulpd %%xmm0, %%xmm5 \n\t" \ + "addpd %%xmm6, %%xmm3 \n\t" \ + "mulpd %%xmm1, %%xmm8 \n\t" \ + "shufpd $0x1, %%xmm0, %%xmm0 \n\t" \ + "addpd %%xmm7, %%xmm4 \n\t" \ + "addpd %%xmm8, %%xmm5" \ + : \ + : \ + "m" ((u).c11.re), \ + "m" ((u).c12.re), \ + "m" ((u).c21.re), \ + "m" ((u).c22.re), \ + "m" ((u).c31.re), \ + "m" ((u).c32.re) \ + : \ + "xmm0", "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("movddup %0, %%xmm9 \n\t" \ + "movddup %1, %%xmm10 \n\t" \ + "movddup %2, %%xmm11 \n\t" \ + "movddup %3, %%xmm12 \n\t" \ + "movddup %4, %%xmm13 \n\t" \ + "movddup %5, %%xmm14 \n\t" \ + "mulpd %%xmm2, %%xmm9 \n\t" \ + "mulpd %%xmm0, %%xmm10 \n\t" \ + "mulpd %%xmm2, %%xmm11 \n\t" \ + "mulpd %%xmm0, %%xmm12 \n\t" \ + "addpd %%xmm9, %%xmm3 \n\t" \ + "mulpd %%xmm2, %%xmm13 \n\t" \ + "addsubpd %%xmm10, %%xmm4 \n\t" \ + "mulpd %%xmm0, %%xmm14 \n\t" \ + "addpd %%xmm11, %%xmm5" \ + : \ + : \ + "m" ((u).c13.re), \ + "m" ((u).c21.im), \ + "m" ((u).c33.re), \ + "m" ((u).c11.im), \ + "m" ((u).c23.re), \ + "m" ((u).c31.im) \ + : \ + "xmm3", "xmm4", "xmm5", "xmm9", \ + "xmm10", "xmm11", "xmm12", "xmm13", \ + "xmm14"); \ +__asm__ __volatile__ ("shufpd $0x1, %%xmm1, %%xmm1 \n\t" \ + "shufpd $0x1, %%xmm2, %%xmm2 \n\t" \ + "addsubpd %%xmm12, %%xmm3 \n\t" \ + "addpd %%xmm13, %%xmm4 \n\t" \ + "addsubpd %%xmm14, %%xmm5" \ + : \ + : \ + : \ + "xmm1", "xmm2", "xmm3", "xmm4", \ + "xmm5"); \ +__asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" \ + "movddup %1, %%xmm7 \n\t" \ + "movddup %2, %%xmm8 \n\t" \ + "movddup %3, %%xmm9 \n\t" \ + "movddup %4, %%xmm10 \n\t" \ + "movddup %5, %%xmm11 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "mulpd %%xmm1, %%xmm8 \n\t" \ + "mulpd %%xmm2, %%xmm9 \n\t" \ + "addsubpd %%xmm6, %%xmm3 \n\t" \ + "mulpd %%xmm1, %%xmm10 \n\t" \ + "addsubpd %%xmm7, %%xmm4 \n\t" \ + "mulpd %%xmm2, %%xmm11 \n\t" \ + "addsubpd %%xmm8, %%xmm5" \ + : \ + : \ + "m" ((u).c12.im), \ + "m" ((u).c23.im), \ + "m" ((u).c32.im), \ + "m" ((u).c13.im), \ + "m" ((u).c22.im), \ + "m" ((u).c33.im) \ + : \ + "xmm3", "xmm4", "xmm5", "xmm6", \ + "xmm7", "xmm8", "xmm9", "xmm10", \ + "xmm11"); \ +__asm__ __volatile__ ("addsubpd %%xmm9, %%xmm3 \n\t" \ + "addsubpd %%xmm10, %%xmm4 \n\t" \ + "addsubpd %%xmm11, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies an su3 vector s with an su3 matrix u^dagger, assuming s is +* stored in xmm0,xmm1,xmm2 +* +* On output the result is in xmm3,xmm4,xmm5 and the registers +* xmm0,xmm1,xmm2 are changed +*/ + +#define _sse_su3_inverse_multiply_dble(u) \ +__asm__ __volatile__ ("movddup %0, %%xmm3 \n\t" \ + "movddup %1, %%xmm6 \n\t" \ + "movddup %2, %%xmm4 \n\t" \ + "movddup %3, %%xmm7 \n\t" \ + "movddup %4, %%xmm5 \n\t" \ + "movddup %5, %%xmm8 \n\t" \ + "mulpd %%xmm0, %%xmm3 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "mulpd %%xmm0, %%xmm4 \n\t" \ + "mulpd %%xmm1, %%xmm7 \n\t" \ + "mulpd %%xmm0, %%xmm5 \n\t" \ + "addpd %%xmm6, %%xmm3 \n\t" \ + "mulpd %6, %%xmm0 \n\t" \ + "mulpd %%xmm1, %%xmm8 \n\t" \ + "addpd %%xmm7, %%xmm4 \n\t" \ + "addpd %%xmm8, %%xmm5 \n\t" \ + "shufpd $0x1, %%xmm0, %%xmm0" \ + : \ + : \ + "m" ((u).c11.re), \ + "m" ((u).c21.re), \ + "m" ((u).c12.re), \ + "m" ((u).c22.re), \ + "m" ((u).c13.re), \ + "m" ((u).c23.re), \ + "m" (_sse_sgn1_dble) \ + : \ + "xmm0", "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("movddup %0, %%xmm9 \n\t" \ + "movddup %1, %%xmm10 \n\t" \ + "movddup %2, %%xmm11 \n\t" \ + "movddup %3, %%xmm12 \n\t" \ + "movddup %4, %%xmm13 \n\t" \ + "movddup %5, %%xmm14 \n\t" \ + "mulpd %%xmm2, %%xmm9 \n\t" \ + "mulpd %6, %%xmm1 \n\t" \ + "mulpd %%xmm0, %%xmm10 \n\t" \ + "mulpd %%xmm2, %%xmm11 \n\t" \ + "mulpd %%xmm0, %%xmm12 \n\t" \ + "addpd %%xmm9, %%xmm3 \n\t" \ + "mulpd %%xmm2, %%xmm13 \n\t" \ + "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \ + "addpd %%xmm10, %%xmm4 \n\t" \ + "mulpd %6, %%xmm2 \n\t" \ + "addpd %%xmm11, %%xmm5 \n\t" \ + "mulpd %%xmm0, %%xmm14 \n\t" \ + "shufpd $0x1, %%xmm2, %%xmm2" \ + : \ + : \ + "m" ((u).c31.re), \ + "m" ((u).c12.im), \ + "m" ((u).c33.re), \ + "m" ((u).c11.im), \ + "m" ((u).c32.re), \ + "m" ((u).c13.im), \ + "m" (_sse_sgn1_dble) \ + : \ + "xmm1", "xmm2", "xmm3", "xmm4", \ + "xmm5", "xmm9", "xmm10", "xmm11", \ + "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" \ + "movddup %1, %%xmm7 \n\t" \ + "movddup %2, %%xmm8 \n\t" \ + "movddup %3, %%xmm9 \n\t" \ + "movddup %4, %%xmm10 \n\t" \ + "movddup %5, %%xmm11 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "addpd %%xmm12, %%xmm3 \n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "addpd %%xmm13, %%xmm4 \n\t" \ + "mulpd %%xmm1, %%xmm8 \n\t" \ + "addpd %%xmm14, %%xmm5 \n\t" \ + "mulpd %%xmm2, %%xmm9 \n\t" \ + "addpd %%xmm6, %%xmm3 \n\t" \ + "mulpd %%xmm1, %%xmm10 \n\t" \ + "addpd %%xmm7, %%xmm4 \n\t" \ + "mulpd %%xmm2, %%xmm11 \n\t" \ + "addpd %%xmm8, %%xmm5" \ + : \ + : \ + "m" ((u).c21.im), \ + "m" ((u).c32.im), \ + "m" ((u).c23.im), \ + "m" ((u).c31.im), \ + "m" ((u).c22.im), \ + "m" ((u).c33.im) \ + : \ + "xmm3", "xmm4", "xmm5", "xmm6", \ + "xmm7", "xmm8", "xmm9", "xmm10", \ + "xmm11"); \ +__asm__ __volatile__ ("addpd %%xmm9, %%xmm3 \n\t" \ + "addpd %%xmm10, %%xmm4 \n\t" \ + "addpd %%xmm11, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/su3.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/su3.h new file mode 100644 index 0000000000000000000000000000000000000000..a7ce06e37455cccf55f1197798ce372639182a3c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/su3.h @@ -0,0 +1,658 @@ + +/******************************************************************************* +* +* File su3.h +* +* Copyright (C) 2005, 2009, 2011, 2013 Martin Luescher, Filippo Palombi +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Type definitions and macros for SU(3) matrices, SU(3) vectors and Dirac +* spinors +* +*******************************************************************************/ + +#ifndef SU3_H +#define SU3_H + +#if ((defined AVX)&&(!(defined x64))) +#define x64 +#endif + +#if (defined x64) +#define ALIGNED8 __attribute__ ((aligned (8))) +#define ALIGNED16 __attribute__ ((aligned (16))) +#define ALIGNED32 __attribute__ ((aligned (32))) +#else +#define ALIGNED8 +#define ALIGNED16 +#define ALIGNED32 +#endif + +typedef struct +{ + float re,im; +} complex; + +typedef struct +{ + complex c1,c2,c3; +} su3_vector; + +typedef struct +{ + complex c11,c12,c13,c21,c22,c23,c31,c32,c33; +} su3; + +typedef struct +{ + float c1,c2,c3,c4,c5,c6,c7,c8; +} su3_alg; + +typedef struct +{ + su3_vector c1,c2; +} weyl; + +typedef struct +{ + su3_vector c1,c2,c3,c4; +} spinor; + +typedef struct +{ + float u[36]; +} pauli; + +typedef struct +{ + float c1,c2,c3,c4,c5,c6,c7,c8,c9; +} u3_alg; + +typedef struct +{ + double re,im; +} complex_dble; + +typedef struct +{ + complex_dble c1,c2,c3; +} su3_vector_dble; + +typedef struct +{ + complex_dble c11,c12,c13,c21,c22,c23,c31,c32,c33; +} su3_dble; + +typedef struct +{ + double c1,c2,c3,c4,c5,c6,c7,c8; +} su3_alg_dble; + +typedef struct +{ + su3_vector_dble c1,c2; +} weyl_dble; + +typedef struct +{ + su3_vector_dble c1,c2,c3,c4; +} spinor_dble; + +typedef struct +{ + double u[36]; +} pauli_dble; + +typedef struct +{ + double c1,c2,c3,c4,c5,c6,c7,c8,c9; +} u3_alg_dble; + +/******************************************************************************* +* +* The following macros are the same for single and double precision types +* +* Depending on the macro, arguments are variables of type su3_vector and su3 +* (or su3_vector_dble and su3_dble) +* +*******************************************************************************/ + +/* +* r.c1=c*s.c1 (c real) +* r.c2=c*s.c2 +* r.c3=c*s.c3 +*/ + +#define _vector_mul(r,c,s) \ + (r).c1.re=(c)*(s).c1.re; \ + (r).c1.im=(c)*(s).c1.im; \ + (r).c2.re=(c)*(s).c2.re; \ + (r).c2.im=(c)*(s).c2.im; \ + (r).c3.re=(c)*(s).c3.re; \ + (r).c3.im=(c)*(s).c3.im + +/* +* r.c1=i*c*s.c1 (c real) +* r.c2=i*c*s.c2 +* r.c3=i*c*s.c3 +*/ + +#define _vector_imul(r,c,s) \ + (r).c1.re=-(c)*(s).c1.im; \ + (r).c1.im= (c)*(s).c1.re; \ + (r).c2.re=-(c)*(s).c2.im; \ + (r).c2.im= (c)*(s).c2.re; \ + (r).c3.re=-(c)*(s).c3.im; \ + (r).c3.im= (c)*(s).c3.re + +/* +* r.c1=c*s.c1 (c complex) +* r.c2=c*s.c2 +* r.c3=c*s.c3 +*/ + +#define _vector_mulc(r,c,s) \ + (r).c1.re=(c).re*(s).c1.re-(c).im*(s).c1.im; \ + (r).c1.im=(c).re*(s).c1.im+(c).im*(s).c1.re; \ + (r).c2.re=(c).re*(s).c2.re-(c).im*(s).c2.im; \ + (r).c2.im=(c).re*(s).c2.im+(c).im*(s).c2.re; \ + (r).c3.re=(c).re*(s).c3.re-(c).im*(s).c3.im; \ + (r).c3.im=(c).re*(s).c3.im+(c).im*(s).c3.re + +/* +* r.c1=s1.c1+s2.c1 +* r.c2=s1.c2+s2.c2 +* r.c3=s1.c3+s2.c3 +*/ + +#define _vector_add(r,s1,s2) \ + (r).c1.re=(s1).c1.re+(s2).c1.re; \ + (r).c1.im=(s1).c1.im+(s2).c1.im; \ + (r).c2.re=(s1).c2.re+(s2).c2.re; \ + (r).c2.im=(s1).c2.im+(s2).c2.im; \ + (r).c3.re=(s1).c3.re+(s2).c3.re; \ + (r).c3.im=(s1).c3.im+(s2).c3.im + +/* +* r.c1=s1.c1-s2.c1 +* r.c2=s1.c2-s2.c2 +* r.c3=s1.c3-s2.c3 +*/ + +#define _vector_sub(r,s1,s2) \ + (r).c1.re=(s1).c1.re-(s2).c1.re; \ + (r).c1.im=(s1).c1.im-(s2).c1.im; \ + (r).c2.re=(s1).c2.re-(s2).c2.re; \ + (r).c2.im=(s1).c2.im-(s2).c2.im; \ + (r).c3.re=(s1).c3.re-(s2).c3.re; \ + (r).c3.im=(s1).c3.im-(s2).c3.im + +/* +* r.c1=s1.c1+i*s2.c1 +* r.c2=s1.c2+i*s2.c2 +* r.c3=s1.c3+i*s2.c3 +*/ + +#define _vector_i_add(r,s1,s2) \ + (r).c1.re=(s1).c1.re-(s2).c1.im; \ + (r).c1.im=(s1).c1.im+(s2).c1.re; \ + (r).c2.re=(s1).c2.re-(s2).c2.im; \ + (r).c2.im=(s1).c2.im+(s2).c2.re; \ + (r).c3.re=(s1).c3.re-(s2).c3.im; \ + (r).c3.im=(s1).c3.im+(s2).c3.re + +/* +* r.c1=s1.c1+i*s2.c1 +* r.c2=s1.c2+i*s2.c2 +* r.c3=s1.c3+i*s2.c3 +*/ + +#define _vector_i_sub(r,s1,s2) \ + (r).c1.re=(s1).c1.re+(s2).c1.im; \ + (r).c1.im=(s1).c1.im-(s2).c1.re; \ + (r).c2.re=(s1).c2.re+(s2).c2.im; \ + (r).c2.im=(s1).c2.im-(s2).c2.re; \ + (r).c3.re=(s1).c3.re+(s2).c3.im; \ + (r).c3.im=(s1).c3.im-(s2).c3.re + +/* +* r.c1+=s.c1 +* r.c2+=s.c2 +* r.c3+=s.c3 +*/ + +#define _vector_add_assign(r,s) \ + (r).c1.re+=(s).c1.re; \ + (r).c1.im+=(s).c1.im; \ + (r).c2.re+=(s).c2.re; \ + (r).c2.im+=(s).c2.im; \ + (r).c3.re+=(s).c3.re; \ + (r).c3.im+=(s).c3.im + +/* +* r.c1-=s.c1 +* r.c2-=s.c2 +* r.c3-=s.c3 +*/ + +#define _vector_sub_assign(r,s) \ + (r).c1.re-=(s).c1.re; \ + (r).c1.im-=(s).c1.im; \ + (r).c2.re-=(s).c2.re; \ + (r).c2.im-=(s).c2.im; \ + (r).c3.re-=(s).c3.re; \ + (r).c3.im-=(s).c3.im + +/* +* r.c1+=i*s.c1 +* r.c2+=i*s.c2 +* r.c3+=i*s.c3 +*/ + +#define _vector_i_add_assign(r,s) \ + (r).c1.re-=(s).c1.im; \ + (r).c1.im+=(s).c1.re; \ + (r).c2.re-=(s).c2.im; \ + (r).c2.im+=(s).c2.re; \ + (r).c3.re-=(s).c3.im; \ + (r).c3.im+=(s).c3.re + +/* +* r.c1-=i*s.c1 +* r.c2-=i*s.c2 +* r.c3-=i*s.c3 +*/ + +#define _vector_i_sub_assign(r,s) \ + (r).c1.re+=(s).c1.im; \ + (r).c1.im-=(s).c1.re; \ + (r).c2.re+=(s).c2.im; \ + (r).c2.im-=(s).c2.re; \ + (r).c3.re+=(s).c3.im; \ + (r).c3.im-=(s).c3.re + +/* +* Real part of the scalar product (r,s) +*/ + +#define _vector_prod_re(r,s) \ + (r).c1.re*(s).c1.re+(r).c1.im*(s).c1.im+ \ + (r).c2.re*(s).c2.re+(r).c2.im*(s).c2.im+ \ + (r).c3.re*(s).c3.re+(r).c3.im*(s).c3.im + +/* +* Imaginary part of the scalar product (r,s) +*/ + +#define _vector_prod_im(r,s) \ + (r).c1.re*(s).c1.im-(r).c1.im*(s).c1.re+ \ + (r).c2.re*(s).c2.im-(r).c2.im*(s).c2.re+ \ + (r).c3.re*(s).c3.im-(r).c3.im*(s).c3.re + +/* +* r.c1+=c*s.c1 (c real) +* r.c2+=c*s.c2 +* r.c3+=c*s.c3 +*/ + +#define _vector_mulr_assign(r,c,s) \ + (r).c1.re+=(c)*(s).c1.re; \ + (r).c1.im+=(c)*(s).c1.im; \ + (r).c2.re+=(c)*(s).c2.re; \ + (r).c2.im+=(c)*(s).c2.im; \ + (r).c3.re+=(c)*(s).c3.re; \ + (r).c3.im+=(c)*(s).c3.im + +/* +* r.c1+=i*c*s.c1 (c real) +* r.c2+=i*c*s.c2 +* r.c3+=i*c*s.c3 +*/ + +#define _vector_mulir_assign(r,c,s) \ + (r).c1.re-=(c)*(s).c1.im; \ + (r).c1.im+=(c)*(s).c1.re; \ + (r).c2.re-=(c)*(s).c2.im; \ + (r).c2.im+=(c)*(s).c2.re; \ + (r).c3.re-=(c)*(s).c3.im; \ + (r).c3.im+=(c)*(s).c3.re + +/* +* r.c1+=z*s.c1 (z of type complex) +* r.c2+=z*s.c2 +* r.c3+=z*s.c3 +*/ + +#define _vector_mulc_assign(r,z,s) \ + (r).c1.re+=((z).re*(s).c1.re-(z).im*(s).c1.im); \ + (r).c1.im+=((z).re*(s).c1.im+(z).im*(s).c1.re); \ + (r).c2.re+=((z).re*(s).c2.re-(z).im*(s).c2.im); \ + (r).c2.im+=((z).re*(s).c2.im+(z).im*(s).c2.re); \ + (r).c3.re+=((z).re*(s).c3.re-(z).im*(s).c3.im); \ + (r).c3.im+=((z).re*(s).c3.im+(z).im*(s).c3.re) + +/* +* r.c1-=z*s.c1 (z of type complex) +* r.c2-=z*s.c2 +* r.c3-=z*s.c3 +*/ + +#define _vector_project(r,z,s) \ + (r).c1.re-=((z).re*(s).c1.re-(z).im*(s).c1.im); \ + (r).c1.im-=((z).re*(s).c1.im+(z).im*(s).c1.re); \ + (r).c2.re-=((z).re*(s).c2.re-(z).im*(s).c2.im); \ + (r).c2.im-=((z).re*(s).c2.im+(z).im*(s).c2.re); \ + (r).c3.re-=((z).re*(s).c3.re-(z).im*(s).c3.im); \ + (r).c3.im-=((z).re*(s).c3.im+(z).im*(s).c3.re) + +/* +* r.c1=c*r.c1+s.c1 (c real) +* r.c2=c*r.c2+s.c2 +* r.c3=c*r.c3+s.c3 +*/ + +#define _vector_mulr_add(r,c,s) \ + (r).c1.re=(c)*(r).c1.re+(s).c1.re; \ + (r).c1.im=(c)*(r).c1.im+(s).c1.im; \ + (r).c2.re=(c)*(r).c2.re+(s).c2.re; \ + (r).c2.im=(c)*(r).c2.im+(s).c2.im; \ + (r).c3.re=(c)*(r).c3.re+(s).c3.re; \ + (r).c3.im=(c)*(r).c3.im+(s).c3.im + +/* +* r.c1=cr*r.c1+cs*s.c1 (cr,cs real) +* r.c2=cr*r.c2+cs*s.c2 +* r.c3=cr*r.c3+cs*s.c3 +*/ + +#define _vector_combine(r,s,cr,cs) \ + (r).c1.re=(cr)*(r).c1.re+(cs)*(s).c1.re; \ + (r).c1.im=(cr)*(r).c1.im+(cs)*(s).c1.im; \ + (r).c2.re=(cr)*(r).c2.re+(cs)*(s).c2.re; \ + (r).c2.im=(cr)*(r).c2.im+(cs)*(s).c2.im; \ + (r).c3.re=(cr)*(r).c3.re+(cs)*(s).c3.re; \ + (r).c3.im=(cr)*(r).c3.im+(cs)*(s).c3.im + +/* +* v.c1=(w.c2*z.c3-w.c3*z.c2)^* +* v.c2=(w.c3*z.c1-w.c1*z.c3)^* +* v.c3=(w.c1*z.c2-w.c2*z.c1)^* +*/ + +#define _vector_cross_prod(v,w,z) \ + (v).c1.re= (w).c2.re*(z).c3.re-(w).c2.im*(z).c3.im \ + -(w).c3.re*(z).c2.re+(w).c3.im*(z).c2.im; \ + (v).c1.im= (w).c3.re*(z).c2.im+(w).c3.im*(z).c2.re \ + -(w).c2.re*(z).c3.im-(w).c2.im*(z).c3.re; \ + (v).c2.re= (w).c3.re*(z).c1.re-(w).c3.im*(z).c1.im \ + -(w).c1.re*(z).c3.re+(w).c1.im*(z).c3.im; \ + (v).c2.im= (w).c1.re*(z).c3.im+(w).c1.im*(z).c3.re \ + -(w).c3.re*(z).c1.im-(w).c3.im*(z).c1.re; \ + (v).c3.re= (w).c1.re*(z).c2.re-(w).c1.im*(z).c2.im \ + -(w).c2.re*(z).c1.re+(w).c2.im*(z).c1.im; \ + (v).c3.im= (w).c2.re*(z).c1.im+(w).c2.im*(z).c1.re \ + -(w).c1.re*(z).c2.im-(w).c1.im*(z).c2.re + +/* +* SU(3) matrix u times SU(3) vector s +* +* r.c1=(u*s).c1 +* r.c2=(u*s).c2 +* r.c3=(u*s).c3 +*/ + +#define _su3_multiply(r,u,s) \ + (r).c1.re= (u).c11.re*(s).c1.re-(u).c11.im*(s).c1.im \ + +(u).c12.re*(s).c2.re-(u).c12.im*(s).c2.im \ + +(u).c13.re*(s).c3.re-(u).c13.im*(s).c3.im; \ + (r).c1.im= (u).c11.re*(s).c1.im+(u).c11.im*(s).c1.re \ + +(u).c12.re*(s).c2.im+(u).c12.im*(s).c2.re \ + +(u).c13.re*(s).c3.im+(u).c13.im*(s).c3.re; \ + (r).c2.re= (u).c21.re*(s).c1.re-(u).c21.im*(s).c1.im \ + +(u).c22.re*(s).c2.re-(u).c22.im*(s).c2.im \ + +(u).c23.re*(s).c3.re-(u).c23.im*(s).c3.im; \ + (r).c2.im= (u).c21.re*(s).c1.im+(u).c21.im*(s).c1.re \ + +(u).c22.re*(s).c2.im+(u).c22.im*(s).c2.re \ + +(u).c23.re*(s).c3.im+(u).c23.im*(s).c3.re; \ + (r).c3.re= (u).c31.re*(s).c1.re-(u).c31.im*(s).c1.im \ + +(u).c32.re*(s).c2.re-(u).c32.im*(s).c2.im \ + +(u).c33.re*(s).c3.re-(u).c33.im*(s).c3.im; \ + (r).c3.im= (u).c31.re*(s).c1.im+(u).c31.im*(s).c1.re \ + +(u).c32.re*(s).c2.im+(u).c32.im*(s).c2.re \ + +(u).c33.re*(s).c3.im+(u).c33.im*(s).c3.re + +/* +* SU(3) matrix u^dagger times SU(3) vector s +* +* r.c1=(u^dagger*s).c1 +* r.c2=(u^dagger*s).c2 +* r.c3=(u^dagger*s).c3 +*/ + +#define _su3_inverse_multiply(r,u,s) \ + (r).c1.re= (u).c11.re*(s).c1.re+(u).c11.im*(s).c1.im \ + +(u).c21.re*(s).c2.re+(u).c21.im*(s).c2.im \ + +(u).c31.re*(s).c3.re+(u).c31.im*(s).c3.im; \ + (r).c1.im= (u).c11.re*(s).c1.im-(u).c11.im*(s).c1.re \ + +(u).c21.re*(s).c2.im-(u).c21.im*(s).c2.re \ + +(u).c31.re*(s).c3.im-(u).c31.im*(s).c3.re; \ + (r).c2.re= (u).c12.re*(s).c1.re+(u).c12.im*(s).c1.im \ + +(u).c22.re*(s).c2.re+(u).c22.im*(s).c2.im \ + +(u).c32.re*(s).c3.re+(u).c32.im*(s).c3.im; \ + (r).c2.im= (u).c12.re*(s).c1.im-(u).c12.im*(s).c1.re \ + +(u).c22.re*(s).c2.im-(u).c22.im*(s).c2.re \ + +(u).c32.re*(s).c3.im-(u).c32.im*(s).c3.re; \ + (r).c3.re= (u).c13.re*(s).c1.re+(u).c13.im*(s).c1.im \ + +(u).c23.re*(s).c2.re+(u).c23.im*(s).c2.im \ + +(u).c33.re*(s).c3.re+(u).c33.im*(s).c3.im; \ + (r).c3.im= (u).c13.re*(s).c1.im-(u).c13.im*(s).c1.re \ + +(u).c23.re*(s).c2.im-(u).c23.im*(s).c2.re \ + +(u).c33.re*(s).c3.im-(u).c33.im*(s).c3.re + +/******************************************************************************* +* +* Macros for SU(3) matrices +* +* Arguments are variables of type su3 +* +*******************************************************************************/ + +/* +* u=v^dagger +*/ + +#define _su3_dagger(u,v) \ + (u).c11.re= (v).c11.re; \ + (u).c11.im=-(v).c11.im; \ + (u).c12.re= (v).c21.re; \ + (u).c12.im=-(v).c21.im; \ + (u).c13.re= (v).c31.re; \ + (u).c13.im=-(v).c31.im; \ + (u).c21.re= (v).c12.re; \ + (u).c21.im=-(v).c12.im; \ + (u).c22.re= (v).c22.re; \ + (u).c22.im=-(v).c22.im; \ + (u).c23.re= (v).c32.re; \ + (u).c23.im=-(v).c32.im; \ + (u).c31.re= (v).c13.re; \ + (u).c31.im=-(v).c13.im; \ + (u).c32.re= (v).c23.re; \ + (u).c32.im=-(v).c23.im; \ + (u).c33.re= (v).c33.re; \ + (u).c33.im=-(v).c33.im + +/* +* u=v*w +*/ + +#define _su3_times_su3(u,v,w) \ + (u).c11.re= (v).c11.re*(w).c11.re-(v).c11.im*(w).c11.im \ + +(v).c12.re*(w).c21.re-(v).c12.im*(w).c21.im \ + +(v).c13.re*(w).c31.re-(v).c13.im*(w).c31.im; \ + (u).c11.im= (v).c11.re*(w).c11.im+(v).c11.im*(w).c11.re \ + +(v).c12.re*(w).c21.im+(v).c12.im*(w).c21.re \ + +(v).c13.re*(w).c31.im+(v).c13.im*(w).c31.re; \ + (u).c12.re= (v).c11.re*(w).c12.re-(v).c11.im*(w).c12.im \ + +(v).c12.re*(w).c22.re-(v).c12.im*(w).c22.im \ + +(v).c13.re*(w).c32.re-(v).c13.im*(w).c32.im; \ + (u).c12.im= (v).c11.re*(w).c12.im+(v).c11.im*(w).c12.re \ + +(v).c12.re*(w).c22.im+(v).c12.im*(w).c22.re \ + +(v).c13.re*(w).c32.im+(v).c13.im*(w).c32.re; \ + (u).c13.re= (v).c11.re*(w).c13.re-(v).c11.im*(w).c13.im \ + +(v).c12.re*(w).c23.re-(v).c12.im*(w).c23.im \ + +(v).c13.re*(w).c33.re-(v).c13.im*(w).c33.im; \ + (u).c13.im= (v).c11.re*(w).c13.im+(v).c11.im*(w).c13.re \ + +(v).c12.re*(w).c23.im+(v).c12.im*(w).c23.re \ + +(v).c13.re*(w).c33.im+(v).c13.im*(w).c33.re; \ + (u).c21.re= (v).c21.re*(w).c11.re-(v).c21.im*(w).c11.im \ + +(v).c22.re*(w).c21.re-(v).c22.im*(w).c21.im \ + +(v).c23.re*(w).c31.re-(v).c23.im*(w).c31.im; \ + (u).c21.im= (v).c21.re*(w).c11.im+(v).c21.im*(w).c11.re \ + +(v).c22.re*(w).c21.im+(v).c22.im*(w).c21.re \ + +(v).c23.re*(w).c31.im+(v).c23.im*(w).c31.re; \ + (u).c22.re= (v).c21.re*(w).c12.re-(v).c21.im*(w).c12.im \ + +(v).c22.re*(w).c22.re-(v).c22.im*(w).c22.im \ + +(v).c23.re*(w).c32.re-(v).c23.im*(w).c32.im; \ + (u).c22.im= (v).c21.re*(w).c12.im+(v).c21.im*(w).c12.re \ + +(v).c22.re*(w).c22.im+(v).c22.im*(w).c22.re \ + +(v).c23.re*(w).c32.im+(v).c23.im*(w).c32.re; \ + (u).c23.re= (v).c21.re*(w).c13.re-(v).c21.im*(w).c13.im \ + +(v).c22.re*(w).c23.re-(v).c22.im*(w).c23.im \ + +(v).c23.re*(w).c33.re-(v).c23.im*(w).c33.im; \ + (u).c23.im= (v).c21.re*(w).c13.im+(v).c21.im*(w).c13.re \ + +(v).c22.re*(w).c23.im+(v).c22.im*(w).c23.re \ + +(v).c23.re*(w).c33.im+(v).c23.im*(w).c33.re; \ + (u).c31.re= (v).c31.re*(w).c11.re-(v).c31.im*(w).c11.im \ + +(v).c32.re*(w).c21.re-(v).c32.im*(w).c21.im \ + +(v).c33.re*(w).c31.re-(v).c33.im*(w).c31.im; \ + (u).c31.im= (v).c31.re*(w).c11.im+(v).c31.im*(w).c11.re \ + +(v).c32.re*(w).c21.im+(v).c32.im*(w).c21.re \ + +(v).c33.re*(w).c31.im+(v).c33.im*(w).c31.re; \ + (u).c32.re= (v).c31.re*(w).c12.re-(v).c31.im*(w).c12.im \ + +(v).c32.re*(w).c22.re-(v).c32.im*(w).c22.im \ + +(v).c33.re*(w).c32.re-(v).c33.im*(w).c32.im; \ + (u).c32.im= (v).c31.re*(w).c12.im+(v).c31.im*(w).c12.re \ + +(v).c32.re*(w).c22.im+(v).c32.im*(w).c22.re \ + +(v).c33.re*(w).c32.im+(v).c33.im*(w).c32.re; \ + (u).c33.re= (v).c31.re*(w).c13.re-(v).c31.im*(w).c13.im \ + +(v).c32.re*(w).c23.re-(v).c32.im*(w).c23.im \ + +(v).c33.re*(w).c33.re-(v).c33.im*(w).c33.im; \ + (u).c33.im= (v).c31.re*(w).c13.im+(v).c31.im*(w).c13.re \ + +(v).c32.re*(w).c23.im+(v).c32.im*(w).c23.re \ + +(v).c33.re*(w).c33.im+(v).c33.im*(w).c33.re + +/******************************************************************************* +* +* Macros for variables of type su3_alg +* +*******************************************************************************/ + +/* +* r+=s +*/ + +#define _su3_alg_add_assign(r,s) \ + (r).c1+=(s).c1; \ + (r).c2+=(s).c2; \ + (r).c3+=(s).c3; \ + (r).c4+=(s).c4; \ + (r).c5+=(s).c5; \ + (r).c6+=(s).c6; \ + (r).c7+=(s).c7; \ + (r).c8+=(s).c8 + +/* +* r-=s +*/ + +#define _su3_alg_sub_assign(r,s) \ + (r).c1-=(s).c1; \ + (r).c2-=(s).c2; \ + (r).c3-=(s).c3; \ + (r).c4-=(s).c4; \ + (r).c5-=(s).c5; \ + (r).c6-=(s).c6; \ + (r).c7-=(s).c7; \ + (r).c8-=(s).c8 + +/* +* s*=c, c real +*/ + +#define _su3_alg_mul_assign(s,c) \ + (s).c1*=(c); \ + (s).c2*=(c); \ + (s).c3*=(c); \ + (s).c4*=(c); \ + (s).c5*=(c); \ + (s).c6*=(c); \ + (s).c7*=(c); \ + (s).c8*=(c) + +/* +* r+=c*s, c real +*/ + +#define _su3_alg_mul_add_assign(r,c,s) \ + (r).c1+=(c)*(s).c1; \ + (r).c2+=(c)*(s).c2; \ + (r).c3+=(c)*(s).c3; \ + (r).c4+=(c)*(s).c4; \ + (r).c5+=(c)*(s).c5; \ + (r).c6+=(c)*(s).c6; \ + (r).c7+=(c)*(s).c7; \ + (r).c8+=(c)*(s).c8 + +/* +* r-=c*s, c real +*/ + +#define _su3_alg_mul_sub_assign(r,c,s) \ + (r).c1-=(c)*(s).c1; \ + (r).c2-=(c)*(s).c2; \ + (r).c3-=(c)*(s).c3; \ + (r).c4-=(c)*(s).c4; \ + (r).c5-=(c)*(s).c5; \ + (r).c6-=(c)*(s).c6; \ + (r).c7-=(c)*(s).c7; \ + (r).c8-=(c)*(s).c8 + +/******************************************************************************* +* +* Macros for variables of type u3_alg +* +*******************************************************************************/ + +/* +* r=c*(u+v) +*/ + +#define _u3_alg_mul_add(r,c,u,v) \ + (r).c1=(c)*((u).c1+(v).c1); \ + (r).c2=(c)*((u).c2+(v).c2); \ + (r).c3=(c)*((u).c3+(v).c3); \ + (r).c4=(c)*((u).c4+(v).c4); \ + (r).c5=(c)*((u).c5+(v).c5); \ + (r).c6=(c)*((u).c6+(v).c6); \ + (r).c7=(c)*((u).c7+(v).c7); \ + (r).c8=(c)*((u).c8+(v).c8); \ + (r).c9=(c)*((u).c9+(v).c9) + +/* +* r=c*(u-v) +*/ + +#define _u3_alg_mul_sub(r,c,u,v) \ + (r).c1=(c)*((u).c1-(v).c1); \ + (r).c2=(c)*((u).c2-(v).c2); \ + (r).c3=(c)*((u).c3-(v).c3); \ + (r).c4=(c)*((u).c4-(v).c4); \ + (r).c5=(c)*((u).c5-(v).c5); \ + (r).c6=(c)*((u).c6-(v).c6); \ + (r).c7=(c)*((u).c7-(v).c7); \ + (r).c8=(c)*((u).c8-(v).c8); \ + (r).c9=(c)*((u).c9-(v).c9) + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/su3fcts.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/su3fcts.h new file mode 100644 index 0000000000000000000000000000000000000000..6f40ca8a7669087a0b0b78e601e8186b3c23dee8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/su3fcts.h @@ -0,0 +1,87 @@ + +/******************************************************************************* +* +* File su3fcts.h +* +* Copyright (C) 2010, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef SU3FCTS_H +#define SU3FCTS_H + +#ifndef SU3_H +#include "su3.h" +#endif + +typedef struct +{ + double t,d; + complex_dble p[3]; +} ch_drv0_t; + +typedef struct +{ + double t,d; + complex_dble p[3]; + complex_dble pt[3],pd[3]; +} ch_drv1_t; + +typedef struct +{ + double t,d; + complex_dble p[3]; + complex_dble pt[3],pd[3]; + complex_dble ptt[3],ptd[3],pdd[3]; +} ch_drv2_t; + +/* CHEXP_C */ +extern void ch2mat(complex_dble *p,su3_alg_dble *X,su3_dble *u); +extern void chexp_drv0(su3_alg_dble *X,ch_drv0_t *s); +extern void chexp_drv1(su3_alg_dble *X,ch_drv1_t *s); +extern void chexp_drv2(su3_alg_dble *X,ch_drv2_t *s); +extern void expXsu3(double eps,su3_alg_dble *X,su3_dble *u); + +/* CM3X3_C */ +extern void cm3x3_zero(int vol,su3_dble *u); +extern void cm3x3_unity(int vol,su3_dble *u); +extern void cm3x3_assign(int vol,su3_dble *u,su3_dble *v); +extern void cm3x3_swap(int vol,su3_dble *u,su3_dble *v); +extern void cm3x3_dagger(su3_dble *u,su3_dble *v); +extern void cm3x3_tr(su3_dble *u,su3_dble *v,complex_dble *tr); +extern void cm3x3_retr(su3_dble *u,su3_dble *v,double *tr); +extern void cm3x3_imtr(su3_dble *u,su3_dble *v,double *tr); +extern void cm3x3_add(su3_dble *u,su3_dble *v); +extern void cm3x3_mul_add(su3_dble *u,su3_dble *v,su3_dble *w); +extern void cm3x3_mulr(double *r,su3_dble *u,su3_dble *v); +extern void cm3x3_mulr_add(double *r,su3_dble *u,su3_dble *v); +extern void cm3x3_mulc(complex_dble *c,su3_dble *u,su3_dble *v); +extern void cm3x3_mulc_add(complex_dble *c,su3_dble *u,su3_dble *v); +extern void cm3x3_lc1(complex_dble *c,su3_dble *u,su3_dble *v); +extern void cm3x3_lc2(complex_dble *c,su3_dble *u,su3_dble *v); + +/* RANDOM_SU3_C */ +extern void random_su3(su3 *u); +extern void random_su3_dble(su3_dble *u); + +/* SU3REN_C */ +extern void project_to_su3(su3 *u); +extern void project_to_su3_dble(su3_dble *u); + +/* SU3PROD_C */ +extern void su3xsu3(su3_dble *u,su3_dble *v,su3_dble *w); +extern void su3dagxsu3(su3_dble *u,su3_dble *v,su3_dble *w); +extern void su3xsu3dag(su3_dble *u,su3_dble *v,su3_dble *w); +extern void su3dagxsu3dag(su3_dble *u,su3_dble *v,su3_dble *w); +extern void su3xu3alg(su3_dble *u,u3_alg_dble *X,su3_dble *v); +extern void su3dagxu3alg(su3_dble *u,u3_alg_dble *X,su3_dble *v); +extern void u3algxsu3(u3_alg_dble *X,su3_dble *u,su3_dble *v); +extern void u3algxsu3dag(u3_alg_dble *X,su3_dble *u,su3_dble *v); +extern double prod2su3alg(su3_dble *u,su3_dble *v,su3_alg_dble *X); +extern void prod2u3alg(su3_dble *u,su3_dble *v,u3_alg_dble *X); +extern void rotate_su3alg(su3_dble *u,su3_alg_dble *X); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sw_term.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sw_term.h new file mode 100644 index 0000000000000000000000000000000000000000..f1ef62241ea64fc87e0dfbb0de4c8deab887525e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sw_term.h @@ -0,0 +1,47 @@ + +/******************************************************************************* +* +* File sw_term.h +* +* Copyright (C) 2005, 2009, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef SW_TERM_H +#define SW_TERM_H + +#ifndef SU3_H +#include "su3.h" +#endif + +#ifndef UTILS_H +#include "utils.h" +#endif + +/* PAULI_C */ +extern void mul_pauli(float mu,pauli *m,weyl *s,weyl *r); +extern void mul_pauli2(float mu,pauli *m,spinor *s,spinor *r); +extern void assign_pauli(int vol,pauli_dble *md,pauli *m); +extern void apply_sw(int vol,float mu,pauli *m,spinor *s,spinor *r); + +/* PAULI_DBLE_C */ +extern void mul_pauli_dble(double mu,pauli_dble *m,weyl_dble *s,weyl_dble *r); +extern int inv_pauli_dble(double mu,pauli_dble *m,pauli_dble *im); +extern complex_dble det_pauli_dble(double mu,pauli_dble *m); +extern void apply_sw_dble(int vol,double mu,pauli_dble *m,spinor_dble *s, + spinor_dble *r); +extern int apply_swinv_dble(int vol,double mu,pauli_dble *m,spinor_dble *s, + spinor_dble *r); + +/* SWFLDS_C */ +extern pauli *swfld(void); +extern pauli_dble *swdfld(void); +extern void assign_swd2sw(void); + +/* SW_TERM_C */ +extern int sw_term(ptset_t set); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/tcharge.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/tcharge.h new file mode 100644 index 0000000000000000000000000000000000000000..dc28cb7c10df1d102afed439fc0b5212781110d0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/tcharge.h @@ -0,0 +1,35 @@ + +/******************************************************************************* +* +* File tcharge.h +* +* Copyright (C) 2010, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef TCHARGE_H +#define TCHARGE_H + +#ifndef SU3_H +#include "su3.h" +#endif + +/* FTCOM_C */ +extern void copy_bnd_ft(int n,u3_alg_dble *ft); +extern void add_bnd_ft(int n,u3_alg_dble *ft); + +/* FTENSOR_C */ +extern u3_alg_dble **ftensor(void); + +/* TCHARGE_C */ +extern double tcharge(void); +extern double tcharge_slices(double *qsl); + +/* YM_ACTION_C */ +extern double ym_action(void); +extern double ym_action_slices(double *asl); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/uflds.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/uflds.h new file mode 100644 index 0000000000000000000000000000000000000000..1ddcb1d850a37d7b711bcf89613beffd6b8ecefa --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/uflds.h @@ -0,0 +1,42 @@ + +/******************************************************************************* +* +* File uflds.h +* +* Copyright (C) 2011, 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef UFLDS_H +#define UFLDS_H + +#ifndef SU3_H +#include "su3.h" +#endif + +/* BSTAP_C */ +extern su3_dble *bstap(void); +extern void set_bstap(void); + +/* PLAQ_SUM_C */ +extern double plaq_sum_dble(int icom); +extern double plaq_wsum_dble(int icom); +extern double plaq_action_slices(double *asl); + +/* SHIFT_C */ +extern int shift_ud(int *s); + +/* UFLDS_C */ +extern su3 *ufld(void); +extern su3_dble *udfld(void); +extern void random_ud(void); +extern void renormalize_ud(void); +extern void assign_ud2u(void); + +/* UDCOM_C */ +extern void copy_bnd_ud(void); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/update.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/update.h new file mode 100644 index 0000000000000000000000000000000000000000..bc9e519b308a3acdd2721533cabfac56a64bc921 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/update.h @@ -0,0 +1,66 @@ + +/******************************************************************************* +* +* File update.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef UPDATE_H +#define UPDATE_H + +#ifndef SU3_H +#include "su3.h" +#endif + +typedef struct +{ + int iop; + double eps; +} mdstep_t; + +/* CHRONO */ +extern void setup_chrono(void); +extern double mdtime(void); +extern void step_mdtime(double dt); +extern void add_chrono(int icr,spinor_dble *psi); +extern int get_chrono(int icr,spinor_dble *psi); +extern void reset_chrono(void); + +/* COUNTERS */ +extern void setup_counters(void); +extern void clear_counters(void); +extern void add2counter(char *type,int idx,int *status); +extern int get_count(char *type,int idx,int *status); +extern void print_avgstat(char *type,int idx); +extern void print_all_avgstat(void); + +/* MDSTEPS_C */ +extern void set_mdsteps(void); +extern mdstep_t *mdsteps(int *nop,int *itu); +extern void print_mdsteps(int ipr); + +/* MDINT_C */ +extern void run_mdint(void); + +/* HMC_C */ +extern void hmc_sanity_check(void); +extern void hmc_wsize(int *nwud,int *nws,int *nwsd,int *nwv,int *nwvd); +extern int run_hmc(double *act0,double *act1); + +/* RWRAT_C */ +extern double rwrat(int irp,int n,int *np,int *isp,double *sqn,int **status); + +/* RWTM_C */ +extern double rwtm1(double mu1,double mu2,int isp,double *sqn,int *status); +extern double rwtm2(double mu1,double mu2,int isp,double *sqn,int *status); + +/* RWTMEO_C */ +extern double rwtm1eo(double mu1,double mu2,int isp,double *sqn,int *status); +extern double rwtm2eo(double mu1,double mu2,int isp,double *sqn,int *status); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/utils.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..340f88854e3b583497f5366ead5c0982ce955baa --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/utils.h @@ -0,0 +1,119 @@ + +/******************************************************************************* +* +* File utils.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef UTILS_H +#define UTILS_H + +#include +#include + +#ifndef SU3_H +#include "su3.h" +#endif + +#if ((DBL_MANT_DIG!=53)||(DBL_MIN_EXP!=-1021)||(DBL_MAX_EXP!=1024)) +#error : Machine is not compliant with the IEEE-754 standard +#endif + +#if (SHRT_MAX==0x7fffffff) +typedef short int stdint_t; +typedef unsigned short int stduint_t; +#elif (INT_MAX==0x7fffffff) +typedef int stdint_t; +typedef unsigned int stduint_t; +#elif (LONG_MAX==0x7fffffff) +typedef long int stdint_t; +typedef unsigned long int stduint_t; +#else +#error : There is no four-byte integer type on this machine +#endif + +#undef UNKNOWN_ENDIAN +#undef LITTLE_ENDIAN +#undef BIG_ENDIAN + +#define UNKNOWN_ENDIAN 0 +#define LITTLE_ENDIAN 1 +#define BIG_ENDIAN 2 + +#undef IMAX +#define IMAX(n,m) ((n)+((m)-(n))*((m)>(n))) + +typedef enum +{ + ALL_PTS,EVEN_PTS,ODD_PTS,NO_PTS,ODD_PTS2,PT_SETS +} ptset_t; + +/* ENDIAN_C */ +extern int endianness(void); +extern void bswap_int(int n,void *a); +extern void bswap_double(int n,void *a); + +/* MUTILS_C */ +extern int find_opt(int argc,char *argv[],char *opt); +extern int fdigits(double x); +extern void check_dir(char* dir); +extern void check_dir_root(char* dir); +extern int name_size(char *format,...); +extern long find_section(char *title); +extern long read_line(char *tag,char *format,...); +extern int count_tokens(char *tag); +extern void read_iprms(char *tag,int n,int *iprms); +extern void read_dprms(char *tag,int n,double *dprms); +extern int copy_file(char *in,char *out); + +/* UTILS_C */ +extern int safe_mod(int x,int y); +extern void *amalloc(size_t size,int p); +extern void afree(void *addr); +extern double amem_use_mb(void); +extern double amem_max_mb(void); +extern int mpi_permanent_tag(void); +extern int mpi_tag(void); +extern void error(int test,int no,char *name,char *format,...); +extern void error_root(int test,int no,char *name,char *format,...); +extern int error_loc(int test,int no,char *name,char *message); +extern void error_chk(void); +extern void message(char *format,...); +extern void mpc_bcast_c(char *buf, int num); +extern void mpc_bcast_d(double *buf, int num); +extern void mpc_bcast_i(int *buf, int num); +extern void mpc_gsum_d(double *src, double *dst, int num); +extern void mpc_print_info(void); + +/* WSPACE_C */ +extern void alloc_wud(int n); +extern su3_dble **reserve_wud(int n); +extern int release_wud(void); +extern int wud_size(void); +extern void alloc_wfd(int n); +extern su3_alg_dble **reserve_wfd(int n); +extern int release_wfd(void); +extern int wfd_size(void); +extern void alloc_ws(int n); +extern spinor **reserve_ws(int n); +extern int release_ws(void); +extern int ws_size(void); +extern void alloc_wsd(int n); +extern spinor_dble **reserve_wsd(int n); +extern int release_wsd(void); +extern int wsd_size(void); +extern void alloc_wv(int n); +extern complex **reserve_wv(int n); +extern int release_wv(void); +extern int wv_size(void); +extern void alloc_wvd(int n); +extern complex_dble **reserve_wvd(int n); +extern int release_wvd(void); +extern int wvd_size(void); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/version.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/version.h new file mode 100644 index 0000000000000000000000000000000000000000..221321d2080a6f80a0cf5da832ae38e1925041d7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/version.h @@ -0,0 +1,18 @@ + +/******************************************************************************* +* +* File version.h +* +* Copyright (C) 2009 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef VERSION_H +#define VERSION_H + +#define openQCD_RELEASE "openQCD-1.4" + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/vflds.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/vflds.h new file mode 100644 index 0000000000000000000000000000000000000000..e152b72556098c54434193516e36189f834e292e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/vflds.h @@ -0,0 +1,44 @@ + +/******************************************************************************* +* +* File vflds.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef VFLDS_H +#define VFLDS_H + +#ifndef SU3_H +#include "su3.h" +#endif + +/* VCOM_C */ +extern void cpv_int_bnd(complex *v); +extern void cpv_ext_bnd(complex *v); + +/* VDCOM_C */ +extern void cpvd_int_bnd(complex_dble *vd); +extern void cpvd_ext_bnd(complex_dble *vd); + +/* VFLDS_C */ +extern complex **vflds(void); +extern complex_dble **vdflds(void); + +/* VINIT_C */ +extern void set_v2zero(int n,complex *v); +extern void set_vd2zero(int n,complex_dble *vd); +extern void random_v(int n,complex *v,float sigma); +extern void random_vd(int n,complex_dble *vd,double sigma); +extern void assign_v2v(int n,complex *v,complex *w); +extern void assign_v2vd(int n,complex *v,complex_dble *wd); +extern void assign_vd2v(int n,complex_dble *vd,complex *w); +extern void assign_vd2vd(int n,complex_dble *vd,complex_dble *wd); +extern void add_v2vd(int n,complex *v,complex_dble *wd); +extern void diff_vd2v(int n,complex_dble *vd,complex_dble *wd,complex *w); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/wflow.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/wflow.h new file mode 100644 index 0000000000000000000000000000000000000000..5024ff4d47dec1132990b3fab6d8720a7347651b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/wflow.h @@ -0,0 +1,20 @@ +/******************************************************************************* +* +* File wflow.h +* +* Copyright (C) 2009, 2010, 2011, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef WFLOW_H +#define WFLOW_H + +/* WFLOW_C */ +extern void fwd_euler(int n,double eps); +extern void fwd_rk2(int n,double eps); +extern void fwd_rk3(int n,double eps); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..381f6e31121fb044cd105134f34d9429cf4bfc6c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/INDEX @@ -0,0 +1,29 @@ + +******************************************************************************** + + Simulations of QCD with Wilson quarks + +******************************************************************************** + +Simulation programs + +qcd1 HMC simulation program for QCD with Wilson quarks. + +ym1 HMC simulation program for the (pure) SU(3) gauge theory. + + +Measurement programs + +ms1 Measurement of reweighting factors. + +ms2 Computation of the spectral range of the hermitian + Dirac operator. + +ms3 Computation of Wilson flow observables. + +ms4 Computation of quark propagators. + + +Some examples of valid input parameter files can be found in the directory +./examples. + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..7f97805f8ab8cb5e2be57c4fc35c658fe6d575a6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/Makefile @@ -0,0 +1,173 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines. Version valid +# for Linux machines with GNU make. +# +# "make" compiles and links the specified main programs and modules, using the +# specified libraries (if any), and produces the executables. +# +# "make clean" removes all files generated by "make". +# +# Dependencies on included files are automatically taken care of. +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = ym1 qcd1 ms1 ms2 ms3 ms4 + +ARCHIVE = archive sarchive + +BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + +DFL = dfl_geometry dfl_subspace ltl_gcr dfl_sap_gcr dfl_modes + +DIRAC = Dw_dble Dw Dw_bnd + +FLAGS = flags action_parms dfl_parms force_parms hmc_parms lat_parms \ + mdint_parms rat_parms rw_parms sap_parms solver_parms + +FORCES = force0 force1 force2 force3 force4 force5 \ + frcfcts genfrc tmcg tmcgm xtensor + +LATTICE = bcnds uidx ftidx geometry + +LINALG = salg salg_dble valg valg_dble liealg cmatrix_dble cmatrix + +LINSOLV = cgne mscg fgcr fgcr4vd + +LITTLE = Aw_gen Aw_com Aw_ops Aw_dble Aw ltl_modes + +MDFLDS = mdflds fcom + +RANDOM = ranlux ranlxs ranlxd gauss + +RATFCTS = elliptic zolotarev ratfcts + +SAP = sap_com sap_gcr sap blk_solv + +SFLDS = sflds scom sdcom Pbnd Pbnd_dble + +SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +SW_TERM = pauli pauli_dble swflds sw_term + +TCHARGE = ftcom ftensor tcharge ym_action + +UFLDS = plaq_sum uflds udcom bstap + +UPDATE = chrono mdsteps counters mdint hmc rwtm rwtmeo rwrat + +UTILS = endian mutils utils wspace + +VFLDS = vflds vinit vcom vdcom + +WFLOW = wflow + +MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ + $(LATTICE) $(LINALG) $(LINSOLV) $(LITTLE) $(MDFLDS) $(RANDOM) \ + $(RATFCTS) $(SAP) $(SFLDS) $(SU3FCTS) $(SW_TERM) $(TCHARGE) \ + $(UFLDS) $(UPDATE) $(UTILS) $(VFLDS) $(WFLOW) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/mdflds:$(MDIR)/su3fcts:\ + $(MDIR)/utils:$(MDIR)/forces:$(MDIR)/sflds:$(MDIR)/dirac:\ + $(MDIR)/sw_term:$(MDIR)/tcharge:$(MDIR)/block:$(MDIR)/sap:\ + $(MDIR)/linsolv:$(MDIR)/dfl:$(MDIR)/vflds:$(MDIR)/little:\ + $(MDIR)/update:$(MDIR)/wflow:$(MDIR)/ratfcts + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +# debugging flags (add to CFLAGS if needed) + +# -DCGNE_DBG -DFGCR_DBG -FGCR4VD_DBG -DMSCG_DBG +# -DDFL_MODES_DBG -DMDINT_DBG -DRWRAT_DBG + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.global b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.global new file mode 100644 index 0000000000000000000000000000000000000000..af2293323dced4a360cb82fe925f915c54ca8527 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.global @@ -0,0 +1,337 @@ + +Header file global.h + +SYNTAX + +In the main program + +#define MAIN_PROGRAM +#include "global.h" + +In all other cases + +#include "global.h" + + +DESCRIPTION + +In this file the globally accessible constants, variables and arrays are +defined. It is here that the geometry of the lattice and its division into +processor sublattices is defined. + + +Lattice geometry +---------------- + +Currently the only constants that the user can specify are + + NPROC0 The processes are thought to be arranged in a hypercubic + NPROC1 grid with NPROC0,..,NPROC3 processes in direction 0,..,3. + NPROC2 If NPROCx=1 the lattice is not divided in direction x. + NPROC3 Otherwise NPROCx has to be even. + + L0 The local lattices are blocks of size L0xL1xL2xL3 that + L1 build up the full lattice in the obvious way. The sizes + L2 of the latter are thus (NPROC0*L0),..,(NPROC3*L3). It + L3 is assumed that L0,..,L3 are all even and at least 4. + + NPROC0_BLK The process grid is logically divided into hypercubic + NPROC1_BLK blocks of size NPROC0_BLK,..,NPROC3_BLK in direction + NPROC2_BLK 0,..,3. NPROCx_BLK must be greater or equal to 1 and + NPROC3_BLK NPROCx must be an integer multiple of NPROCx_BLK. + +The program verifies at compilation time that the values of these constants +are in the allowed range. See the section "MPI process ranking" below for +further explanation of the parameters NPROCx_BLK. + +All other macros that are defined in global.h are derived from these input +values. In particular + + NPROC Total number of processes. + + VOLUME Number of lattice points in the local lattice + [=L0*L1*L2*L3]. + +Independently of the boundary conditions imposed on the dynamical fields, the +lattice is considered to be a 4-dimensional torus. Depending on the process +numbers NPROC0,..,NPROC3, the local lattices can have non-empty boundaries on +up to 8 sides. A two-dimensional sketch of the situation is + + + + + + + + + * Volume points = the true + + * * * * * * * + local lattice. + + * * * * * * * + + + * * * * * * * + + Exterior boundary points = + + * * * * * * * + copies of the corresponding + + * * * * * * * + points of the local lattices + + * * * * * * * + on the neighbouring processes. + + * * * * * * * + + + * * * * * * * + + + + + + + + + + +Note that there is no boundary in direction x if NPROCx=1, since the exterior +boundary points in that direction coincide, in this case, with the interior +boundary points on the opposite side of the local lattice. The numbers of +exterior boundary points in direction 0,1,2,3 and the total number of boundary +points are + + FACE0 + FACE1 + FACE2 + FACE3 + + BNDRY = 2*(FACE0+FACE1+FACE2+FACE3) + +where, by definition, FACEx=0 if NPROCx=1. The boundaries of the local lattice +are labeled such that the face in direction -0 has label 0, the face in +direction +0 has label 1, the face in direction -1 has label 2, and so on. + +The global arrays that define the process grid are + + int cpr[4] Cartesian coordinates of the local process. + + int npr[8] Process ids of the 8 processes that operate on the 8 + neighbouring lattices of the local lattice. Explicitly, + npr[2*mu] is the id of the process in direction -mu and + npr[2*mu+1] the same in direction +mu. + +The global arrays that define the lattice geometry are + + int ipt[VOLUME] ipt[x3+L3*x2+L2*L3*x1+L1*L2*L3*x0] is the index of the + point on the local lattice with Cartesian coordinates + (x0,x1,x2,x3), where the coordinate x0 ranges from 0 + to L0-1, x1 from 0 to L1-1, and so on. + + int iup[VOLUME][4] iup[ix][mu] is the index of the nearest neighbour + point in the positive ("up") direction mu of the + point on the local lattice with index ix. If the + nearest neighbour point is on the boundary of the + lattice, the index iy=iup[ix][mu] is in the range + VOLUME<=iy [-noexp] [-a [-norng]] + + +DESCRIPTION + +This program reads gauge field configurations from disk and computes +stochastic estimates of reweighting factors. + + +COMMAND-LINE OPTIONS + +The program has only few options since most of the parameters are passed +through an input file. The options are + +-i Specifies the name of the input file. The name can be + a fully qualified name or be specified relative to the + working directory. + +-noexp Field configurations are normally read in exported + file format from the specified configuration directory. + If this option is set, the configurations are instead + expected in the imported file format on the local disks. + +-a This option specifies that the run is a continuation of + a previous run. All output data are appended to the + previous output files. + +-norng Continuation runs normally start from the saved state + of the random number generators. This option specifies + that the traditional initialization of the generators is + to be used (see section RANDOM NUMBER GENERATOR below). + NOTE: starting from the saved state is not possible if + the process grid sizes NPROC0,..,NPROC3 are changed. + + +INPUT PARAMETERS + +The lattice size and the process grid must be defined in the file global.h +(see README.global). All other parameters are read from the input file. An +example of a valid input file is ms1.in in this directory. The parameter +values specified in this file are: + +[Run name] +name Snoopy137 # Run name = configuration base name + +[Directories] +log_dir ../data/ms1/log # Log file directory +dat_dir ../data/ms1/dat # Data file directory +loc_dir /ndata/qcd1/cnfg # Local configuration directory +cnfg_dir /data/qcd1/cnfg # Exported configuration directory + +[Configurations] +first 1 # No of the first configuration to consider +last 4 # No of the last configuration +step 1 # Configuration separation (last-first must + # be an integer multiple of step) +nrw 2 # Number of reweighting factors to be + # computed in this run + +[Random number generator] +level 0 # Ranlux level +seed 73099 # Ranlux seed + +[Lattice parameters] +kappa 0.1300 0.1290 # List of sea-quark hopping parameters +csw 1.234 # Coefficient of the SW term in the + # Dirac operator + +[Boundary conditions] +type 2 # Type of boundary condition (0: open, + # 1: SF, 2: open-SF, 3: periodic) +phi 0.12 -0.56 # Boundary values of the gauge field at + # time 0 +phi' 0.92 0.76 # Boundary values of the gauge field at + # time NPROC0*L0 +cF 0.95 # Fermion action improvement coefficient + # at time 0 +cF' 0.90 # Fermion action improvement coefficient + # at time NPROC0*L0 + +Then follows a description of the reweighting factors labeled by an index that +runs from 0 to nrw-1 (see flags/rw_parms.c). The available reweighting factors +and associated parameter sections are described in the file doc/parms.pdf (see +the top of the modules update/rwtm.c, update/rwtmeo.c and update/rwrat.c for +further explanations). + +Reweighting factors of type RWRAT require a choice of a rational function. The +solvers to be used need to be specified too (see doc/parms.pdf). + +Superfluous sections and parameters may be deleted or commented out. If +present they are not read by the program and the specified values (if any) +have no effect on the run. As already mentioned, the indices of the parameter +sections describing the reweighting factors must increase in steps of 1 from 0 +to nrw-1. The indices of the solver sections can be freely chosen in the range +0,..,31. + + +FILES + +The program searches for exported field configurations + + n + +in the directory cnfg_dir, where is the configuration number. +Imported configurations + + n_0 (on process 0) + n_1 (on process 1) + n_2 (on process 2) + ... ... + +are searched in the directory loc_dir. + +The program writes the results of the computations to the files + + .ms1.log Log file + .ms1.log~ Backup log file + + .ms1.dat Measurement data file + .ms1.dat~ Backup data file + + .ms1.par Parameter data file + .ms1.par~ Backup parameter data file + + .ms1.rng Exported state of the random number generators + .ms1.rng~ Backup random number generator state file + +in the directories log_dir (log file) and dat_dir (data files). The parameter +file is created at the beginning of the run and remains unchanged after that. +The backup copies *.log~, *.dat~ and *.rng~ of the *.log, *.dat and *.rng file +are updated each time a configuration is fully processed. + +The directories log_dir and dat_dir, as well as the directory cnfg_dir if the +option -noexp is not set, must be accessible from process 0. If the -noexp +option is set, the directory loc_dir must be accessible from all processes. + + +OUTPUT DATA + +At the beginning of the data file the program writes the data contained in the +header structure + +static struct +{ + int nrw; + int *nfct,*nsrc; +} file_head; + +where nrw is the number of reweighting factors specified in the input file, +nfct[0],..,nfct[nrw-1] the array of the associated numbers of Hasenbusch +factors (set to 1 for RWRAT reweighting factors) and nsrc[0],..,nsrc[nrw-1] +the array of the associated numbers N of source fields. + +After the header data, the data file contains a sequence of structures + +static struct +{ + int nc; + double ***sqn,***lnr; +} data; + +labeled by the field configuration number nc. For each configuration, the data +are + + sqn[irw][ifct][isrc] Square norm of the source field number isrc + generated in the course of the calculation of + the factor number ifct of the reweighting factor + number irw. + + lnr[irw][ifct][isrc] The logarithm, -ln(r), of the associated stochastic + estimate of the reweighting factor r (irw=0,..,nrw-1, + ifct=0,..,nfct[irw-1], isrc=0,..,nsrc[irw]-1). + +See the functions write_file_head() and write_data() in the file ms1.c for the +exact order in which the data are written to the output files. + +From these data, the stochastic estimates W[irw] of the reweighting factor +number irw are obtained by calculating the averages + + w[irw][ifct]= + + (1/nsrc[irw])*sum_{isrc=0}^{nsrc[irw]-1} exp{-lnr[irw][ifct][isrc]} + +and the product + + W[irw]=prod_{ifct=0}^{nfct[irw]-1} w[irw][ifct] + +A simple main program that reads and analyses the data files is included in +the directory ../devel/nompi/main. + + +BINARY FILE FORMAT + +The log files are ASCII files that should be readable on any machine. The +data files, on the other hand, are written in binary format using the fwrite() +function. Integers are written as 4 byte signed integers and floating-point +numbers according to the IEEE-754 standard for double-precision numbers. + +Binary data written to disk are converted to little endian byte order if the +machine is big endian. Field configurations and measurement data stored on +disk are thus always in little endian byte order independently of the machine +that is used. + + +RANDOM NUMBER GENERATOR + +Random numbers are generated using the ranlux generator. Depending on the +context, either single- or double-precision random numbers are generated. The +initialization of the generator is as follows: + +- In the case of a new run, the program reads the parameters "level" and + "seed" from the input file and uses these to initialize the generator. + +- Continuation runs do the following: + + o If the option -norng is set, the parameters "level" and "seed" are read + from the input parameter file and the generator is initialized using + "seed"^n (bitwise exclusive or) as the seed value, where n is the number + of the last field configuration saved in the previous run. + + o Otherwise the state of the generator is read from the file + .ms1.rng. The generator is thus reset to the state it had at the + end of the previous run. Note that the process grid NPROC0x..xNPROC3 must + be unchanged in this case from one run to the next (an error occurs if it + is not). + +In a sequence of continuation runs, it is therefore recommended to leave the +process grid unchanged and to make no use of the option -norng. If the process +grid is changed at some point, the next run must start from an exported field +configuration and the option -norng must be set. In all cases, the parameters +"level" and "seed" on the input parameter file may be left unchanged. + +Note that if the configurations are read in imported form, the state of the +generator is *not* set to the one stored on the configuration file. The +generated random numbers, and consequently the computed reweighting factors, +are therefore independent of whether the configurations are read in imported +or exported form. + + +SAFETY MEASURES AND ERROR REPORTING + +A number of safety measures have been implemented: + +- It is not possible to overwrite an existing log or data file; these + must first be deleted or renamed by hand if a run is to be repeated. + +- Appending a run to a previous measurement run is only possible if the run + name and all relevant parameters match. Moreover, the new configuration + sequence must extend the previous one with the same configuration spacing. + +- The accessibility of the various directories and the compatibility + of the chosen parameters is checked at the beginning of the program. + +Any attempt to force illegal operations leads to an abnormal termination of +the program, with an informative message being written either to the log file +or the file STARTUP_ERROR in the program directory (if the error occurs before +the log file is opened). + +It should be noted that filenames may not be longer than 127 characters. The +program checks at an early stage whether this is the case or not. Longer +filenames can be accommodated by setting the macro NAME_SIZE in the header +file global.h to a larger value. + + +CHECKPOINTS AND EARLY TERMINATION + +The program can be stopped gracefully by touching a file in the log directory +with the same name as the log file but with extension .end instead of .log. It +may take a while until the program exits, because it will only do so when the +current field configuration is fully processed. + +If the machine crashes, or if the program was stopped in the way described, +the run can always be continued starting from the saved output files. However, +after a crash, the log and data files may be corrupted, in which case they +must first be restored from the backup files. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ms2 b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ms2 new file mode 100644 index 0000000000000000000000000000000000000000..1874fd3774aa4f12777e7d844e51e942e98b9bac --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ms2 @@ -0,0 +1,140 @@ + +Main program ms2 + +SYNTAX + +ms2 -i [-noexp] + + +DESCRIPTION + +This program reads gauge field configurations from disk and estimates the +spectral range [ra,rb] of the even-odd preconditioned hermitian Dirac operator +(Dwhat^dagger*Dwhat)^(1/2) using the power method. A summary of results is +printed to the log file together with a table of suggested parameters of the +Zolotarev rational approximation for the operator (Dwhat^dagger*Dwhat)^(-1/2). + + +COMMAND-LINE OPTIONS + +The program has only few options since most of the parameters are passed +through an input file. The options are + +-i Specifies the name of the input file. The name can be + a fully qualified name or be specified relative to the + working directory. + +-noexp Field configurations are normally read in exported + file format from the specified configuration directory. + If this option is set, the configurations are instead + expected in the imported file format on the local disks. + + +INPUT PARAMETERS + +The lattice size and the process grid must be defined in the file global.h +(see README.global). All other parameters are read from the input file. An +example of a valid input file is ms2.in in this directory. The parameter +values specified in this file are: + +[Run name] +name Snoopy137 # Run name = configuration base name + +[Directories] +log_dir ../data/ms2/log # Log file directory +loc_dir /ndata/qcd1/cnfg # Local configuration directory +cnfg_dir /data/qcd1/cnfg # Exported configuration directory + +[Configurations] +first 1 # No of the first configuration to consider +last 4 # No of the last configuration +step 1 # Configuration separation (last-first must + +[Dirac operator] +kappa 0.1300 # Hopping parameter +csw 1.234 # Coefficient of the SW term + +[Boundary conditions] +type 2 # Type of boundary condition (0: open, + # 1: SF, 2: open-SF, 3: periodic) +phi 0.12 -0.56 # Boundary values of the gauge field at + # time 0 +phi' 0.92 0.76 # Boundary values of the gauge field at + # time NPROC0*L0 +cF 0.95 # Fermion action improvement coefficient + # at time 0 +cF' 0.90 # Fermion action improvement coefficient + # at time NPROC0*L0 + +[Power method] +np_ra 20 # Number of power iterations to be + # applied when estimating ra +np_rb 100 # Number of power iterations to be + # applied when estimating rb + +Then follows a description of the solver for the Dirac equation to be used in +the course of the inverse power iterations (see doc/parms.pdf). The supported +solvers are CGNE, SAP_GCR and DFL_SAP_GCR. + +Superfluous sections and parameters may be deleted or commented out. If +present they are not read by the program and the specified values (if any) +have no effect on the run. The solver index must be set to 0. + + +FILES + +The program searches for exported field configurations + + n + +in the directory cnfg_dir, where is the configuration number. +Imported configurations + + n_0 (on process 0) + n_1 (on process 1) + n_2 (on process 2) + ... ... + +are searched in the directory loc_dir. + +The program prints the results of the computations to the files + + .ms2.log Log file + .ms2.log~ Backup log file + +in the directory log_dir. The backup file is updated each time a configuration +is fully processed. + +The directory log_dir, as well as the directory cnfg_dir if the -noexp option +is not set, must be accessible from process 0. If the -noexp option is set, +the directory loc_dir must be accessible from all processes. + + +SAFETY MEASURES AND ERROR REPORTING + +A number of safety measures have been implemented: + +- It is not possible to overwrite an existing log file. The file + must first be deleted or renamed if a run is to be repeated. + +- The accessibility of the various directories and the compatibility + of the chosen parameters is checked at the beginning of the program. + +Any attempt to force illegal operations leads to an abnormal termination of +the program, with an informative message being written either to the log file +or the file STARTUP_ERROR in the program directory (if the error occurs before +the log file is opened). + +It should be noted that filenames may not be longer than 127 characters. The +program checks at an early stage whether this is the case or not. Longer +filenames can be accommodated by setting the macro NAME_SIZE in the header +file global.h to a larger value. + + +EARLY TERMINATION + +The program can be stopped gracefully by touching a file in the log directory +with the same name as the log file but with extension .end instead of .log. It +may take a while until the program exits, because it will only do so when the +current field configuration is fully processed. + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ms3 b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ms3 new file mode 100644 index 0000000000000000000000000000000000000000..6245f39d3a15ebd9403bd419e5722d3433ea6297 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ms3 @@ -0,0 +1,200 @@ + +Main program ms3 + +SYNTAX + +ms3 -i [-noexp] [-a] + + +DESCRIPTION + +This program reads gauge field configurations from disk and computes +a set of Wilson flow observables. + + +COMMAND-LINE OPTIONS + +The program has only few options since most of the parameters are passed +through an input file. The options are + +-i Specifies the name of the input file. The name can be + a fully qualified name or be specified relative to the + working directory. + +-noexp Field configurations are normally read in exported + file format from the specified configuration directory. + If this option is set, the configurations are instead + expected in the imported file format on the local disks. + +-a This option specifies that the run is a continuation of + a previous run. All output data are appended to the + previous output files. + + +INPUT PARAMETERS + +The lattice size and the process grid must be defined in the file global.h +(see README.global). All other parameters are read from the input file. An +example of a valid input file is ms3.in in this directory. The parameter +values specified in this file are: + +[Run name] +name Snoopy137 # Run name = configuration base name + +[Directories] +log_dir ../data/ms3/log # Log file directory +dat_dir ../data/ms3/dat # Data file directory +loc_dir /ndata/qcd1/cnfg # Local configuration directory +cnfg_dir /data/qcd1/cnfg # Exported configuration directory + +[Configurations] +first 1 # No of the first configuration to consider +last 4 # No of the last configuration +step 1 # Configuration separation (last-first must + # be an integer multiple of step) + +[Boundary conditions] +type 2 # Type of boundary condition (0: open, + # 1: SF, 2: open-SF, 3: periodic) +phi 0.12 -0.56 # Boundary values of the gauge field at + # time 0 +phi' 0.92 0.76 # Boundary values of the gauge field at + # time NPROC0*L0 + +[Wilson flow] +integrator RK3 # EULER: Euler, RK2: 2nd order Runge-Kutta + # RK3: 3rd order Runge-Kutta +eps 2.0e-2 # Integration time step size +nstep 100 # Number of integration steps +dnms 10 # Number of integration steps between + # observable measurements + +Superfluous parameters may be deleted or commented out. If present they are +not read by the program and the specified values (if any) have no effect on +the run. + +FILES + +The program searches for exported field configurations + + n + +in the directory cnfg_dir, where is the configuration number. +Imported configurations + + n_0 (on process 0) + n_1 (on process 1) + n_2 (on process 2) + ... ... + +are searched in the directory loc_dir. + +The program writes the results of the computations to the files + + .ms3.log Log file + .ms3.log~ Backup log file + + .ms3.dat Measurement data file + .ms3.dat~ Backup data file + + .ms3.par Parameter data file + .ms3.par~ Backup parameter data file + +in the directories log_dir (log file) and dat_dir (data files). The parameter +file is created at the beginning of the run and remains unchanged after that. +The backup log and data files are updated each time a configuration is fully +processed. + +The directories log_dir and dat_dir, as well as the directory cnfg_dir if the +-noexp option is not set, must be accessible from process 0. If the -noexp +option is set, the directory loc_dir must be accessible from all processes. + + +OUTPUT DATA + +For each configuration, the Wilson flow is integrated from flow time 0 to time +"nstep"*"eps" in steps of eps using the specified integrator. After every +"dnms" integration steps, the time-slice sums of the densities of the Wilson +plaquette action, the Yang-Mills action and the topological charge are +computed (see uflds/plaq_sum.c, tcharge/ftensor.c, tcharge/ym_action.c and +tcharge/tcharge.c). + +At the beginning of the measurement data file the program writes the data +contained in the header structure + +static struct +{ + int dn,nn,tmax; + double eps; +} file_head; + +where dn="dnms", nn="nstep"/"dnms" and tmax=NPROC0*L0. After the header data, +the data file contains a sequence of data structures + +static struct +{ + int nc; + double **Wsl,**Ysl,**Qsl; +} data; + +labeled by the configuration number nc. In each case the time-slice sums of +the densities of the Wilson plaquette action, the Yang-Mills action and the +topological charge are written to the arrays + + Wsl[in][t] (in=0,..,nn, t=0,..,tmax-1) + Ysl[in][t] + Qsl[in][t] + +See the functions write_file_head() and write_data() in the program file +ms3.c for the exact order in which the data are written to the output files. + + +BINARY FILE FORMAT + +The log files are ASCII files that should be readable on any machine. The +data files, on the other hand, are written in binary format using the fwrite() +function. Integers are written as 4 byte signed integers and floating-point +numbers according to the IEEE-754 standard for double-precision numbers. + +Binary data written to disk are converted to little endian byte order if the +machine is big endian. Field configurations and measurement data stored on +disk are thus always in little endian byte order independently of the machine +that is used. + + +SAFETY MEASURES AND ERROR REPORTING + +A number of safety measures have been implemented: + +- It is not possible to overwrite an existing log or data file; these + must first be deleted or renamed by hand if a run is to be repeated. + +- Appending a run to a previous measurement run is only possible if the run + name and all relevant parameters match. Moreover, the new configuration + sequence must extend the previous one with the same configuration spacing. + +- The accessibility of the various directories and the compatibility + of the chosen parameters is checked at the beginning of the program. + +Any attempt to force illegal operations leads to an abnormal termination of +the program, with an informative message being written either to the log file +or the file STARTUP_ERROR in the program directory (if the error occurs before +the log file is opened). + +It should be noted that filenames may not be longer than 127 characters. The +program checks at an early stage whether this is the case or not. Longer +filenames can be accommodated by setting the macro NAME_SIZE in the header +file global.h to a larger value. + + +CHECKPOINTS AND EARLY TERMINATION + +The program can be stopped gracefully by touching a file in the log directory +with the same name as the log file but with extension .end instead of .log. It +may take a while until the program exits, because it will only do so when the +current field configuration is fully processed. + +If the machine crashes, or if the program was stopped in the way described, +the run can always be continued starting from the saved output files. However, +after a crash, the log and data files may be corrupted, in which case they +must first be restored from the backup files. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ms4 b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ms4 new file mode 100644 index 0000000000000000000000000000000000000000..c00dbf84a40f5dae8a92c3d02f2b1c373bd5bad4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ms4 @@ -0,0 +1,201 @@ + +Main program ms4 + +SYNTAX + +ms4 -i [-noexp] + + +DESCRIPTION + +This program reads gauge field configurations from disk and computes the quark +propagator for a number of gaussian random source fields at a specified time +x0. The calculated propagators are exported to disk in a universal format (see +archive/sarchive.c). The program has a very limited functionality and serves +for illustration only. + + +COMMAND-LINE OPTIONS + +The program has only few options since most of the parameters are passed +through an input file. The options are + +-i Specifies the name of the input file. The name can be + a fully qualified name or be specified relative to the + working directory. + +-noexp Field configurations are normally read in exported + file format from the specified configuration directory. + If this option is set, the configurations are instead + expected in the imported file format on the local disks. + + +INPUT PARAMETERS + +The lattice size and the process grid must be defined in the file global.h +(see README.global). All other parameters are read from the input file. An +example of a valid input file is ms4.in in this directory. The parameter +values specified in this file are: + +[Run name] +name Snoopy137 # Run name = configuration base name + +[Directories] +log_dir ../data/ms4/log # Log file directory +loc_dir /ndata/qcd1/cnfg # Local configuration directory +cnfg_dir /data/qcd1/cnfg # Exported configuration directory +sfld_dir /data/ms4/sfld # Exported propagator directory + +[Configurations] +first 1 # No of the first configuration to consider +last 4 # No of the last configuration +step 1 # Configuration separation (last-first must + # be an integer multiple of step) + +[Random number generator] +level 0 # Ranlux level +seed 73099 # Ranlux seed + +[Dirac operator] +kappa 0.1300 # Hopping parameter +mu 0.001 # Twisted mass +csw 1.234 # Coefficient of the SW term + +[Boundary conditions] +type 2 # Type of boundary condition (0: open, + # 1: SF, 2: open-SF, 3: periodic) +phi 0.12 -0.56 # Boundary values of the gauge field at + # time 0 +phi' 0.92 0.76 # Boundary values of the gauge field at + # time NPROC0*L0 +cF 0.95 # Fermion action improvement coefficient + # at time 0 +cF' 0.90 # Fermion action improvement coefficient + # at time NPROC0*L0 + +[Source fields] +x0 20 # Time at which the random source fields + # live (0<=x0n + +in the directory cnfg_dir, where is the configuration number. +Imported configurations + + n_0 (on process 0) + n_1 (on process 1) + n_2 (on process 2) + ... ... + +are searched in the directory loc_dir. + +The program prints some information on the progress of the computations +to the files + + .ms4.log Log file + .ms4.log~ Backup log file + +in the directory log_dir. The backup file is updated each time a configuration +is fully processed. + +The calculated solutions of the Dirac equation are stored in the files + + n.s0 (source no 0) + n.s1 (source no 1) + n.s2 (source no 2) + ... ... + +in the directory sfld_dir (nsrc files per gauge field configuration). These +files can be read using the program import_sfld() [archive/sarchive.c]. + +The directories log_dir and sfld_dir, as well as the directory cnfg_dir if the +-noexp option is not set, must be accessible from process 0. If the -noexp +option is set, the directory loc_dir must be accessible from all processes. + + +SOLVER PERFORMANCE + +The program prints the time required for the solution of the Dirac equation to +the log file. When selecting the solver, one should take into account that the +CGNE solver tends to be very slow at small quark masses. In the case of the +GCR solvers, the performance may be poor when the twisted quark mass mu is +larger than, say, 0.1 and much larger than the ordinary quark mass. The use of +the deflated solver is recommended if both masses are small. + +The processing times per gauge field configuration quoted in the log file +include the time required for the I/O operations. + + +BINARY FILE FORMAT + +The *.log files are ASCII files that should be readable on any machine. Data +and configuration files, on the other hand, are written in binary format using +the fwrite() function. Integers are written as 4 byte signed integers and +floating-point numbers according to the IEEE-754 standard for double-precision +numbers. + +Binary data written to disk are converted to little endian byte order if the +machine is big endian. Field configurations and measurement data stored on +disk are thus always in little endian byte order independently of the machine +that is used. + + +RANDOM NUMBER GENERATOR + +Random numbers are generated using the ranlux generator. The generator is +initialized using the values of the parameters "level" and "seed" specified in +the input file. If the configurations are read in imported form, the state of +the generator is *not* set to the one stored on the configuration file. The +generated random numbers, and consequently the random source fields, are +therefore independent of whether the configurations are read in imported or +exported form. + + +SAFETY MEASURES AND ERROR REPORTING + +A number of safety measures have been implemented: + +- It is not possible to overwrite an existing log file. This file + must first be deleted or renamed if a run is to be repeated. + +- The accessibility of the various directories and the compatibility + of the chosen parameters is checked at the beginning of the program. + +Any attempt to force illegal operations leads to an abnormal termination of +the program, with an informative message being written either to the log file +or the file STARTUP_ERROR in the program directory (if the error occurs before +the log file is opened). + +On the other hand, once a run started successfully, the calculated propagators +are saved unconditionally, i.e. any existing propagator files with matching +filenames are overwritten. + +It should be noted that filenames may not be longer than 127 characters. The +program checks at an early stage whether this is the case or not. Longer +filenames can be accommodated by setting the macro NAME_SIZE in the header +file global.h to a larger value. + + +EARLY TERMINATION + +The program can be stopped gracefully by touching a file in the log directory +with the same name as the log file but with extension .end instead of .log. It +may take a while until the program exits, because it will only do so when the +current field configuration is fully processed. + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.qcd1 b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.qcd1 new file mode 100644 index 0000000000000000000000000000000000000000..fa97c8ecc0b042026f6facb413bd4a918c8cae3e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.qcd1 @@ -0,0 +1,466 @@ + +Main program qcd1 + +SYNTAX + +qcd1 -i [-noloc] [-noexp] [-rmold] [-noms] + [-c [-a [-norng]]] + + +DESCRIPTION + +This program generates an ensemble of gauge fields representative of QCD with +a multiplet of Wilson quarks. Exactly which theory is simulated depends on the +parameters passed to the program. Moreover, one has a choice of boundary +conditions in time (open, SF, open-SF and periodic). The simulation is based +on a version of the HMC algorithm, which can be tuned in many ways via the +input parameters. + +In the course of the simulation, the average plaquette is measured +and the generated gauge field configurations are written out to files in +specified directories. Further observables, constructed using the Wilson flow, +are measured as well. + + +COMMAND-LINE OPTIONS + +The program has only few options since most of the parameters are passed +through an input file. The options are + +-i Specifies the name of the input file. The name can be + a fully qualified name or be specified relative to the + working directory. + +-noloc Normally the generated configurations are saved to the + local disks on the nodes of the machine. In addition they + are exported from process 0 using the export function (see + archive/archive.c). Initial configurations can be imported + or be read from the local disks. This option specifies that + the local disks should not be used. + +-noexp Do not export the generated field configurations. + +-rmold Remove old configurations and keep only the one which + was last saved to disk. The initial configuration + given on the command line is not removed unless the + -a option is set. + +-noms Do not measure any observables except for the average + plaquette. + +-c This option instructs the program to read the initial + gauge-field configuration from the specified file. The + file must be in one of the configuration directories + listed in the input file and its name must be of the form + described below. The run starts from a random gauge-field + configuration if this option is omitted. + +-a This option specifies that the run is a continuation of a + previous run. The -c option is required in this + case and must point to the last configuration saved by + the previous run. All output data are appended to the + previous output files. + +-norng Continuation runs normally start from the saved state + of the random number generators. This option specifies + that the traditional initialization of the generators is + to be used (see section RANDOM NUMBER GENERATOR below). + NOTE: starting from the saved state is not possible if + the process grid sizes NPROC0,..,NPROC3 are changed. + +The concurrent use of the options -noloc and -noexp (in which case the +generated configurations would not be saved anywhere) is considered to be an +error and is not permitted. In a sequence of continuation runs, the -noms +option must be set in either all or none of the runs. + + +INPUT PARAMETERS + +The lattice size and the process grid must be defined in the file global.h +(see README.global). All other parameters are read from the input file and the +command line. An example of a valid input file is qcd1.in in this directory. +The parameter values specified in this file are: + +[Run name] +name Snoopy137 # Run name = configuration base name + +[Directories] +log_dir ../data/qcd1/log # Log file directory +dat_dir ../data/qcd1/dat # Data file directory +loc_dir /ndata/qcd1/cnfg # Local configuration directory +cnfg_dir /data/qcd1/cnfg # Exported configuration directory + +[Lattice parameters] +beta 6.0 # Inverse gauge coupling +c0 1.6667 # Coefficient of the plaquette term + # in the gauge action +kappa 0.1300 # List of sea-quark hopping parameters +csw 1.234 # Coefficient of the SW term in the + # Dirac operator + +[Boundary conditions] +type 2 # Type of boundary condition (0: open, + # 1: SF, 2: open-SF, 3: periodic) +phi 0.12 -0.56 # Boundary values of the gauge field at + # time 0 +phi' 0.92 0.76 # Boundary values of the gauge field at + # time NPROC0*L0 +cG 1.10 # Gauge action improvement coefficient at + # time 0 +cG' 1.05 # Gauge action improvement coefficient at + # time NPROC0*L0 +cF 0.95 # Fermion action improvement coefficient + # at time 0 +cF' 0.90 # Fermion action improvement coefficient + # at time NPROC0*L0 + +[Random number generator] +level 0 # Ranlux level +seed 73099 # Ranlux seed + +[HMC parameters] +actions 0 1 2 # Gauge and pseudo-fermion actions included + # in the simulation +npf 2 # Number of pseudo-fermion fields +mu 0.01 1.0 # List of twisted-mass parameters +nlv 3 # Number of levels of the integrator for + # the molecular-dynamics (MD) equations +tau 0.5 # MD trajectory length + +[MD trajectories] +nth 320 # Number of thermalization trajectories +ntr 32000 # Total number of trajectories +dtr_log 4 # Separation of log entries +dtr_ms 8 # Separation of measurements +dtr_cnfg 32 # Separation of configuration saves + +Then follow the parameters of the integrator levels, the actions, the MD +forces and the solvers for the Dirac equation. Their format is described in +the file doc/parms.pdf and on top of the modules + +flags/mdint_parms.c +flags/action_parms.c +flags/force_parms.c +flags/solver_parms.c + +The integrator levels are labeled from 0 (innermost level, usually including +the force deriving from the chosen gauge action) to nlv-1 (outermost level). +Action, force and solver labels must be integers but may otherwise be chosen +arbitrarily. There must be a section for all actions, forces and solvers used, +and for each action section there must be a corresponding force section with +the same label. + +Finally, if measurements using the Wilson flow are to be made, the section + +[Wilson flow] +integrator RK3 # EULER: Euler, RK2: 2nd order Runge-Kutta + # RK3: 3rd order Runge-Kutta +eps 2.0e-2 # Integration step size +nstep 100 # Total number of integration steps +dnms 10 # Number of steps between measurements + +is required. + +The chosen parameter values must satisfy the following constraints: + +- "nth" and "ntr" must be integer multiples of "dtr_cnfg". + +- "nth" must be equal to zero in a continuation run (option -a). + +- "dtr_cnfg" must be a multiple of "dtr_log". + +- "dtr_cnfg" must be a multiple of "dtr_ms" and the latter must be + a multiple of "dtr_log". + +- The number "nstep" of Wilson flow integration steps must be a multiple + of "dnms". + +Depending on the specified options, the values of some parameters are ignored. +In particular, + +- "loc_dir" is not used if the -noloc option is set. + +- "cnfg_dir" is not used if -noexp is set and if the starting + configuration is not of the exported configuration type. + +- "lambda" is only required if the 2nd order OMF integrator is used. + +- The section "Wilson flow" and the parameter "dtr_ms" can be omitted + if the -noms option is set. + +Superfluous sections and parameters may be deleted or commented out. If +present they are not read by the program and have no effect on the run. In +particular, the constraints mentioned above involving these parameters need +not be satisfied. + + +INITIAL FIELD CONFIGURATION + +The initial field configuration specified on the command line with the -c +option can be in imported or exported form (see archive/archive.c). In the +case of imported configurations, each MPI process reads a file of the form + + _ + +where is the process number. On the command line, imported and exported +configurations are distinguished by an asterix (*) like + + * Imported configuration + + Exported configuration + +where it goes without saying that the string must not contain an +asterix at its end. + +Configurations in imported form are read from the directory loc_dir on the +local disks of the machine. The sizes of the current lattice and those read +from the files must be the same in this case. + +If the configuration is in exported form, it is read from the directory +cnfg_dir on a disk accessible from process 0. The sizes of the current lattice +need not be the same as those read from the configuration file, but must be +integer multiples of the latter. The field is periodically extended if the +lattice sizes do not match (see archive/archive.c for further explanations). + + +FILES + +The program writes the results of the calculations to the files + + .log Log file + .par Parameter file + .dat Data file + .ms.dat Measurement data file + .rng Exported state of the random number generators + + .log~ Backup log file + .par~ Backup parameter file + .dat~ Backup data file + .ms.dat~ Backup measurement data file + .rng~ Backup random number generator state file + + n3_0 Imported configuration file written by process 0 + n3_1 Imported configuration file written by process 1 + n3_2 Imported configuration file written by process 2 + ..... ..... + + n3 Exported configuration file + +Here n3 identifies configuration number 3. The directories in which these +files are stored are the ones specified in the input file. + +The directories "log_dir", "dat_dir" and "cnfg_dir" must be accessible from +process 0, while each process must be able to access the directory "loc_dir" +(unless the option -noloc is set). The "loc_dir" directory seen from different +processes may or may not be physically the same. + +Configurations are saved after the first "nth" trajectories and then after +every "dtr_cnfg" trajectories. The backup copies *.log~, *.dat~ and *.rng~ of +the *.log, *.dat and *.rng files are created each time a new configuration is +saved to disk. + +The parameter file *.par is created when a new run is started. It contains all +relevant lattice and run parameters in binary form. Continuation runs read the +file and check whether the parameter values match those read from the input +file. If a mismatch is discovered, the program is halted and an error message +is printed to the file STARTUP_ERROR in the program directory. + + +EXAMPLES + +The command + + qcd1 -i qcd1.in -c * + +starts a new run from the specified configuration which is searched for +in the "loc_dir" directory on the local disks of the machine. If instead the +run should be a continuation run, starting from the last configuration of a +previous run, the command would be + + qcd1 -i qcd1.in -c n3* -a + +In this case the *.log, *.par, *.dat and *.rng files of the previous run must +be found in the directories "log_dir" and "dat_dir", respectively. Using these +files, and the configuration name given on the command line, a number of +checks are performed to ensure that the run is indeed a continuation of the +previous one. + +In these two examples, the configuration filenames could also be and +n3 (i.e. without a "*") in which case the program assumes that the +configuration is an exported one. The configuration is then searched for in +the directory "cnfg_dir" by process 0 only. If the -c option is omitted, the +gauge field variables are set to uniformly distributed random SU(3) matrices. + + +RUN DATA + +The data taken after every "dtr_log" trajectories are collected in a structure + +typedef struct +{ + int nt,iac; + double dH,avpl; +} dat_t; + +with elements + +nt trajectory number, + +dH MD hamiltonian deficit at the end of the trajectory, + +iac 0 or 1 depending on whether the trajectory was accepted + or not, + +avpl average plaquette of the current gauge field. + +The average plaquette is equal to + + plaq_wsum_dble(1)/npl, + + npl=6*(N0-1)*N1*N2*N3 for open boundary conditions, + + =6*N0*N1*N2*N3 otherwise, + +where N0=NPROC0*L0, etc., are the lattice sizes (see uflds/plaq_sum.c). In the +course of the simulation, the collected data are written in binary form to the +*.dat file in a contiguous manner and without any header data at the beginning +of the file. They are also printed to the log file together with the average +solver iteration numbers and some further information. + +A simple main program that reads and analyses the run data files is included +in the directory ../devel/nompi/main. + + +MEASUREMENT DATA + +Unless the -noms option is set, the program performs measurements of a set of +observables based on the Wilson flow after every period of "dtr_ms" MD +trajectories. No measurements are performed in the thermalization phase (i.e. +at trajectory numbers less than "nth"). + +Each time a measurement is made, the Wilson flow is integrated from flow time +0 to time "nstep"*"eps" in steps of eps using the specified integrator. After +every "dnms" integration steps, the time-slice sums of the densities of the +Wilson plaquette action, the Yang-Mills action and the topological charge are +computed (see uflds/plaq_sum.c, tcharge/ftensor.c and tcharge/tcharge.c). + +At the beginning of the measurement data file the program writes the data +contained in the header structure + +static struct +{ + int dn,nn,tmax; + double eps; +} file_head; + +where dn="dnms", nn="nstep"/"dnms" and tmax=NPROC0*L0. After the header data, +the data file contains a sequence of data structures + +static struct +{ + int nt; + double **Wsl,**Ysl,**Qsl; +} data; + +labeled by the molecular-dynamics trajectory number nt where the measurement +was made. In each case the time-slice sums of the densities of the Wilson +plaquette action, the Yang-Mills action and the topological charge are written +to the arrays + + Wsl[in][t] (in=0,..,nn, t=0,..,tmax-1) + Ysl[in][t] + Qsl[in][t] + +See the functions write_file_head() and write_data() in the program file +qcd1.c for the exact order in which the data are written to the output files. + + +BINARY FILE FORMAT + +The log files are ASCII files that should be readable on any machine. +Configuration files and the data files, on the other hand, are written in +binary format using the fwrite() function. Integers are written as 4 byte +signed integers and floating-point numbers according to the IEEE-754 standard +for double-precision numbers. + +In the case of the exported configurations, the *.par and the *.dat files, and +if the machine is big endian, the data are converted to little endian byte +order before they are written to disk (see archive/archive.c and the functions +write_dat(), read_dat(), write_file_head() and write_data() defined in the +qcd1.c file). + + +RANDOM NUMBER GENERATOR + +Random numbers are generated using the ranlux generator. Depending on the +context, either single- or double-precision random numbers are generated. The +initialization of the generator is as follows: + +- In the case of a new run, the program reads the parameters "level" and + "seed" from the input file and uses these to initialize the generator. + +- Continuation runs starting from an imported field configuration read + the state of the generator from the configuration files. + +- Continuation runs starting from an exported field configuration do + the following: + + o If the option -norng is set, the parameters "level" and "seed" are read + from the input parameter file and the generator is initialized using + "seed"^n (bitwise exclusive or) as the seed value, where n is the number + of the last field configuration saved in the previous run. + + o Otherwise the state of the generator is read from the file .rng. + The generator is thus reset to the state it had at the end of the previous + run. Note that the process grid NPROC0x..xNPROC3 must be unchanged in this + case from one run to the next (an error occurs if it is not). + +In a sequence of continuation runs, it is therefore recommended to leave the +process grid unchanged and to make no use of the option -norng. If the process +grid is changed at some point, the next run must start from an exported field +configuration and the option -norng must be set. In all cases, the parameters +"level" and "seed" on the input parameter file may be left unchanged. + + +SAFETY MEASURES AND ERROR REPORTING + +A number of safety measures have been implemented: + +- It is not possible to overwrite an existing *.log or *.dat file; these + must first be deleted or renamed by hand if a run is to be repeated. + +- Appending a run to a previous run, but not from the last saved + configuration of that run, is not possible. + +- The accessibility of the various directories and the compatibility + of the selected options is checked at the beginning of the program. + +Any attempt to force illegal operations leads to an abnormal termination of +the program, with an informative message being written either to the *.log +file or the file STARTUP_ERROR in the program directory (if the error occurs +before the log file is opened). + +On the other hand, the following should be kept in mind: + +- Filenames may not be longer than 127 characters. The program + checks at an early stage whether this is the case or not. Longer + filenames can be accommodated by setting the macro NAME_SIZE in + the global.h header file to a larger value. + +- Once a run started successfully, the configurations generated + are saved unconditionally, i.e. any existing field configurations + with matching filenames are overwritten. + + +CHECKPOINTS AND EARLY TERMINATION + +The program can be stopped gracefully by touching a file in the log directory +with the same name as the log file but with extension .end instead of .log. It +may take a while until the program exits, because it will only do so at the +points where the gauge field configuration is saved to disk. + +If the machine crashes, or if the program was stopped in the way described, +the run can always be continued starting from the saved configuration and +output files. However, after a crash, the *.log and *.dat files may be +corrupted, in which case they must first be restored from the backup *.log~ +and *.dat~ files. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ym1 b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ym1 new file mode 100644 index 0000000000000000000000000000000000000000..0e409441908b4b3cbec87d80c0b31b1ba0164025 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ym1 @@ -0,0 +1,439 @@ + +Main program ym1 + +SYNTAX + +ym1 -i [-noloc] [-noexp] [-rmold] [-noms] + [-c [-a [-norng]]] + + +DESCRIPTION + +This program generates an ensemble of gauge fields representative of the +(pure) SU(3) gauge theory. Exactly which theory is simulated depends on the +parameters passed to the program. Moreover, one has a choice of boundary +conditions in time (open, SF, open-SF and periodic). The simulation is +based on a version of the HMC algorithm, which can be tuned in many ways via +the input parameters. + +In the course of the simulation, the average plaquette is measured +and the generated gauge field configurations are written out to files in +specified directories. Further observables, constructed using the Wilson flow, +are measured as well. + + +COMMAND-LINE OPTIONS + +The program has only few options since most of the parameters are passed +through an input file. The options are + +-i Specifies the name of the input file. The name can be + a fully qualified name or be specified relative to the + working directory. + +-noloc Normally the generated configurations are saved to the + local disks on the nodes of the machine. In addition they + are exported from process 0 using the export function (see + archive/archive.c). Initial configurations can be imported + or be read from the local disks. This option specifies that + the local disks should not be used. + +-noexp Do not export the generated field configurations. + +-rmold Remove old configurations and keep only the one which + was last saved to disk. The initial configuration + given on the command line is not removed unless the + -a option is set. + +-noms Do not measure any observables except for the average + plaquette. + +-c This option instructs the program to read the initial + gauge-field configuration from the specified file. The + file must be in one of the configuration directories + listed in the input file and its name must be of the form + described below. The run starts from a random gauge-field + configuration if this option is omitted. + +-a This option specifies that the run is a continuation of a + previous run. The -c option is required in this + case and must point to the last configuration saved by + the previous run. All output data are appended to the + previous output files. + +-norng Continuation runs normally start from the saved state + of the random number generators. This option specifies + that the traditional initialization of the generators is + to be used (see section RANDOM NUMBER GENERATOR below). + NOTE: starting from the saved state is not possible if + the process grid sizes NPROC0,..,NPROC3 are changed. + +The concurrent use of the options -noloc and -noexp (in which case the +generated configurations would not be saved anywhere) is considered to be an +error and is not permitted. In a sequence of continuation runs, the -noms +option must be set in either all or none of the runs. + + +INPUT PARAMETERS + +The lattice size and the process grid must be defined in the file global.h +(see README.global). All other parameters are read from the input file and the +command line. An example of a valid input file is ym1.in in this directory. +The parameter values specified in this file are: + +[Run name] +name Snoopy137 # Run name = configuration base name + +[Directories] +log_dir ../data/ym1/log # Log file directory +dat_dir ../data/ym1/dat # Data file directory +loc_dir /ndata/ym1/cnfg # Local configuration directory +cnfg_dir /data/ym1/cnfg # Exported configuration directory + +[Lattice parameters] +beta 6.00 # Inverse gauge coupling +c0 1.6667 # Coefficient of the plaquette term + # in the gauge action +[Boundary conditions] +type 2 # Type of boundary condition (0: open, + # 1: SF, 2: open-SF, 3: periodic) +phi 0.12 -0.56 # Boundary values of the gauge field at + # time 0 +phi' 0.92 0.76 # Boundary values of the gauge field at + # time NPROC0*L0 +cG 1.10 # Gauge action improvement coefficient at + # time 0 +cG' 1.05 # Gauge action improvement coefficient at + # time NPROC0*L0 + +[Random number generator] +level 0 # Ranlux level +seed 73099 # Ranlux seed + +[Trajectory length] +tau 3.0 # Molecular-dynamics (MD) trajectory length + +[MD integrator] +integrator OMF4 # LPFR: leapfrog, OMF2: 2nd order OMF, + # OMF4: 4th order OMF +lambda 0.19 # Parameter of the OMF2 integrator +nstep 16 # Number of integration steps per trajectory + +[MD trajectories] +nth 320 # Number of thermalization trajectories +ntr 32000 # Total number of trajectories +dtr_log 4 # Separation of log entries +dtr_ms 8 # Separation of measurements +dtr_cnfg 32 # Separation of configuration saves + +[Wilson flow] +integrator RK3 # EULER: Euler, RK2: 2nd order Runge-Kutta + # RK3: 3rd order Runge-Kutta +eps 2.0e-2 # Integration step size +nstep 100 # Total number of integration steps +dnms 10 # Number of steps between measurements + +The chosen parameter values must satisfy the following constraints: + +- "nth" and "ntr" must be integer multiples of "dtr_cnfg". + +- "nth" must be equal to zero in a continuation run (option -a). + +- "dtr_cnfg" must be a multiple of "dtr_log". + +- "dtr_cnfg" must be a multiple of "dtr_ms" and the latter must be + a multiple of "dtr_log". + +- The number "nstep" of Wilson flow integration steps must be a multiple + of "dnms". + +Depending on the specified options, the values of some parameters are ignored. +In particular, + +- "loc_dir" is not used if the -noloc option is set. + +- "cnfg_dir" is not used if -noexp is set and if the starting + configuration is not of the exported configuration type. + +- "lambda" is only required if the 2nd order OMF integrator is used. + +- The section "Wilson flow" and the parameter "dtr_ms" can be omitted + if the -noms option is set. + +Superfluous sections and parameters may be deleted or commented out. If +present they are not read by the program and have no effect on the run. In +particular, the constraints mentioned above involving these parameters need +not be satisfied. + + +INITIAL FIELD CONFIGURATION + +The initial field configuration specified on the command line with the -c +option can be in imported or exported form (see archive/archive.c). In the +case of imported configurations, each MPI process reads a file of the form + + _ + +where is the process number. On the command line, imported and exported +configurations are distinguished by an asterix (*) like + + * Imported configuration + + Exported configuration + +where it goes without saying that the string must not contain an +asterix at its end. + +Configurations in imported form are read from the directory loc_dir on the +local disks of the machine. The sizes of the current lattice and those read +from the files must be the same in this case. + +If the configuration is in exported form, it is read from the directory +cnfg_dir on a disk accessible from process 0. The sizes of the current lattice +need not be the same as those read from the configuration file, but must be +integer multiples of the latter. The field is periodically extended if the +lattice sizes do not match (see archive/archive.c for further explanations). + + +FILES + +The program stores the results to a number of files with the following file +names: + + .log Log file + .par Parameter file + .dat Data file + .ms.dat Measurement data file + .rng Exported state of the random number generators + + .log~ Backup log file + .par~ Backup parameter file + .dat~ Backup data file + .ms.dat~ Backup measurement data file + .rng~ Backup random number generator state file + + n3_0 Imported configuration file written by process 0 + n3_1 Imported configuration file written by process 1 + n3_2 Imported configuration file written by process 2 + ..... ..... + + n3 Exported configuration file + +Here n3 identifies configuration number 3. The directories in which these +files are stored are the ones specified in the input file. + +The directories "log_dir", "dat_dir" and "cnfg_dir" must be accessible from +process 0, while each process must be able to access the directory "loc_dir" +(unless the option -noloc is set). The "loc_dir" directory seen from different +processes may or may not be physically the same. + +Configurations are saved after the first "nth" trajectories and then after +every "dtr_cnfg" trajectories. The backup copies *.log~, *.dat~ and *.rng~ of +the *.log, *.dat and *.rng files are created each time a new configuration is +saved to disk. + +The parameter file *.par is created when a new run is started. It contains all +relevant lattice and run parameters in binary form. Continuation runs read the +file and check whether the parameter values match those read from the input +file. If a mismatch is discovered, the program is halted and an error message +is printed to the file STARTUP_ERROR in the program directory. + + +EXAMPLES + +The command + + ym1 -i ym1.in -c * + +starts a new run from the specified configuration which is searched for +in the "loc_dir" directory on the local disks of the machine. If instead the +run should be a continuation run, starting from the last configuration of a +previous run, the command would be + + ym1 -i ym1.in -c n3* -a + +In this case the *.log, *.par, *.dat and *.rng files of the previous run must +be found in the directories "log_dir" and "dat_dir", respectively. Using these +files, and the configuration name given on the command line, a number of +checks are performed to ensure that the run is indeed a continuation of the +previous one. + +In these two examples, the configuration filenames could also be and +n3 (i.e. without a "*") in which case the program assumes that the +configuration is an exported one. The configuration is then searched for in +the directory "cnfg_dir" by process 0 only. If the -c option is omitted, the +gauge field variables are set to uniformly distributed random SU(3) matrices. + + +RUN DATA + +The data taken after every "dtr_log" trajectories are collected in a structure + +typedef struct +{ + int nt,iac; + double dH,avpl; +} dat_t; + +with elements + +nt trajectory number, + +dH MD hamiltonian deficit at the end of the trajectory, + +iac 0 or 1 depending on whether the trajectory was accepted + or not, + +avpl average plaquette of the current gauge field. + +The average plaquette is equal to + + plaq_wsum_dble(1)/npl, + + npl=6*(N0-1)*N1*N2*N3 for open boundary conditions, + + =6*N0*N1*N2*N3 otherwise, + +where N0=NPROC0*L0, etc., are the lattice sizes (see uflds/plaq_sum.c). In the +course of the simulation, the collected data are written in binary form to the +*.dat file in a contiguous manner and without any header data at the beginning +of the file. They are also printed to the log file together with the average +solver iteration numbers and some further information. + +A simple main program that reads and analyses the run data files is included +in the directory ../devel/nompi/main. + + +MEASUREMENT DATA + +Unless the -noms option is set, the program performs measurements of a set of +observables based on the Wilson flow after every period of "dtr_ms" MD +trajectories. No measurements are performed in the thermalization phase (i.e. +at trajectory numbers less than "nth"). + +Each time a measurement is made, the Wilson flow is integrated from flow time +0 to time "nstep"*"eps" in steps of eps using the specified integrator. After +every "dnms" integration steps, the time-slice sums of the densities of the +Wilson plaquette action, the Yang-Mills action and the topological charge are +computed (see uflds/plaq_sum.c, tcharge/ftensor.c and tcharge/tcharge.c). + +At the beginning of the measurement data file the program writes the data +contained in the header structure + +static struct +{ + int dn,nn,tmax; + double eps; +} file_head; + +where dn="dnms", nn="nstep"/"dnms" and tmax=NPROC0*L0. After the header data, +the data file contains a sequence of data structures + +static struct +{ + int nt; + double **Wsl,**Ysl,**Qsl; +} data; + +labeled by the molecular-dynamics trajectory number nt where the measurement +was made. In each case the time-slice sums of the densities of the Wilson +plaquette action, the Yang-Mills action and the topological charge are written +to the arrays + + Wsl[in][t] (in=0,..,nn, t=0,..,tmax-1) + Ysl[in][t] + Qsl[in][t] + +See the functions write_file_head() and write_data() in the program file +ym1.c for the exact order in which the data are written to the output files. + + +BINARY FILE FORMAT + +The *.log files are ASCII files that should be readable on any machine. +Configuration files and the *.dat files, on the other hand, are written in +binary format using the fwrite() function. Integers are written as 4 byte +signed integers and floating-point numbers according to the IEEE-754 standard +for double-precision numbers. + +In the case of the exported configurations, the *.par and the *.dat files, and +if the machine is big endian, the data are converted to little endian byte +order before they are written to disk (see archive/archive.c and the functions +write_dat(), read_dat(), write_file_head() and write_data() defined in the +ym1.c file). + + +RANDOM NUMBER GENERATOR + +Random numbers are generated using the ranlux generator. Depending on the +context, either single- or double-precision random numbers are generated. The +initialization of the generator is as follows: + +- In the case of a new run, the program reads the parameters "level" and + "seed" from the input file and uses these to initialize the generator. + +- Continuation runs starting from an imported field configuration read + the state of the generator from the configuration files. + +- Continuation runs starting from an exported field configuration do + the following: + + o If the option -norng is set, the parameters "level" and "seed" are read + from the input parameter file and the generator is initialized using + "seed"^n (bitwise exclusive or) as the seed value, where n is the number + of the last field configuration saved in the previous run. + + o Otherwise the state of the generator is read from the file .rng. + The generator is thus reset to the state it had at the end of the previous + run. Note that the process grid NPROC0x..xNPROC3 must be unchanged in this + case from one run to the next (an error occurs if it is not). + +In a sequence of continuation runs, it is therefore recommended to leave the +process grid unchanged and to make no use of the option -norng. If the process +grid is changed at some point, the next run must start from an exported field +configuration and the option -norng must be set. In all cases, the parameters +"level" and "seed" on the input parameter file may be left unchanged. + + +SAFETY MEASURES AND ERROR REPORTING + +A number of safety measures have been implemented: + +- It is not possible to overwrite an existing *.log or *.dat file; these + must first be deleted or renamed by hand if a run is to be repeated. + +- Appending a run to a previous run, but not from the last saved + configuration of that run, is not possible. + +- The accessibility of the various directories and the compatibility + of the selected options is checked at the beginning of the program. + +Any attempt to force illegal operations leads to an abnormal termination of +the program, with an informative message being written either to the *.log +file or the file STARTUP_ERROR in the program directory (if the error occurs +before the log file is opened). + +On the other hand, the following should be kept in mind: + +- Filenames may not be longer than 127 characters. The program + checks at an early stage whether this is the case or not. Longer + filenames can be accommodated by setting the macro NAME_SIZE in + the global.h header file to a larger value. + +- Once a run started successfully, the configurations generated + are saved unconditionally, i.e. any existing field configurations + with matching filenames are overwritten. + + +CHECKPOINTS AND EARLY TERMINATION + +The program can be stopped gracefully by touching a file in the log directory +with the same name as the log file but with extension .end instead of .log. It +may take a while until the program exits, because it will only do so at the +points where the gauge field configuration is saved to disk. + +If the machine crashes, or if the program was stopped in the way described, +the run can always be continued starting from the saved configuration and +output files. However, after a crash, the *.log and *.dat files may be +corrupted, in which case they must first be restored from the backup *.log~ +and *.dat~ files. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/README new file mode 100644 index 0000000000000000000000000000000000000000..d291cd7673bcb616900db0b64f8c9251a2d9ca95 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/README @@ -0,0 +1,11 @@ + +******************************************************************************** + + Examples of input parameter files + +******************************************************************************** + +Some of the input parameter files included in these directories have been used +in actual simulation and measurement runs. Note, however, that simulations +require thermalization in the course of which the parameters may have to be +chosen differently. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/48x24v1.ms1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/48x24v1.ms1.in new file mode 100644 index 0000000000000000000000000000000000000000..ad826547dce830260280ecb6aaaef9f3f0cd3432 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/48x24v1.ms1.in @@ -0,0 +1,69 @@ + +################################################################################ +# +# Two-flavour QCD with open boundary conditions, twisted-mass reweighting of +# the first kind and decomposition of the reweighting factor in 2 factors. +# +################################################################################ + +[Run name] +name 48x24v1 + +[Directories] +log_dir /data/openQCD/ms1/log +dat_dir /data/openQCD/ms1/dat +loc_dir /ndata/openQCD/cnfg +cnfg_dir /data/openQCD/cnfg + +[Configurations] +first 100 +last 150 +step 1 +nrw 1 + +[Random number generator] +level 0 +seed 79232 + +[Lattice parameters] +kappa 0.13625 +csw 1.90952 + +[Boundary conditions] +type 0 +cF 1.0 + +[Reweighting factor 0] +rwfact RWTM1 +im0 0 +mu 0.001 0.003 +isp 0 +nsrc 24 + +[Solver 0] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-11 + +[SAP] +bs 4 6 6 4 + +[Deflation subspace] +bs 4 6 6 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.13635 +mu 0.001 +ninv 10 +nmr 4 +ncy 4 + +[Deflation projection] +nkv 16 +nmx 128 +res 1.0e-2 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/48x24v2.ms1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/48x24v2.ms1.in new file mode 100644 index 0000000000000000000000000000000000000000..e1f99b2d1388ba68c569e6171ec3652c39122def --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/48x24v2.ms1.in @@ -0,0 +1,68 @@ + +################################################################################ +# +# Two-flavour QCD with periodic boundary conditions, twisted-mass reweighting +# of the second kind and decomposition of the reweighting factor in 3 factors. +# +################################################################################ + +[Run name] +name 48x24v2 + +[Directories] +log_dir /data/openQCD/ms1/log +dat_dir /data/openQCD/ms1/dat +loc_dir /ndata/openQCD/cnfg +cnfg_dir /data/openQCD/cnfg + +[Configurations] +first 10 +last 90 +step 2 +nrw 1 + +[Random number generator] +level 0 +seed 78711 + +[Lattice parameters] +kappa 0.13635 +csw 1.90952 + +[Boundary conditions] +type 3 + +[Reweighting factor 0] +rwfact RWTM2 +im0 0 +mu 0.0005 0.001 0.003 +isp 0 +nsrc 32 + +[Solver 0] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-11 + +[SAP] +bs 4 6 6 4 + +[Deflation subspace] +bs 4 6 6 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.13635 +mu 0.001 +ninv 10 +nmr 4 +ncy 4 + +[Deflation projection] +nkv 16 +nmx 128 +res 1.0e-2 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/48x24v3.ms1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/48x24v3.ms1.in new file mode 100644 index 0000000000000000000000000000000000000000..817d57db1e035f9a0db4773723d40f8838c90fd9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/48x24v3.ms1.in @@ -0,0 +1,72 @@ + +################################################################################ +# +# Two-flavour QCD with SF boundary conditions, twisted-mass reweighting of +# the second kind, even-odd preconditioning and no decomposition of the +# reweighting factor. +# +################################################################################ + +[Run name] +name 48x24v3 + +[Directories] +log_dir /data/openQCD/ms1/log +dat_dir /data/openQCD/ms1/dat +loc_dir /ndata/openQCD/cnfg +cnfg_dir /data/openQCD/cnfg + +[Configurations] +first 126 +last 225 +step 1 +nrw 1 + +[Random number generator] +level 0 +seed 887056 + +[Lattice parameters] +kappa 0.13635 +csw 1.90952 + +[Boundary conditions] +type 1 +phi 0.5 -0.25 +phi' 0.0 0.0 +cF 1.0 + +[Reweighting factor 0] +rwfact RWTM2_EO +im0 0 +mu 0.0045 +isp 0 +nsrc 24 + +[Solver 0] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-11 + +[SAP] +bs 4 6 6 4 + +[Deflation subspace] +bs 4 6 6 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.13635 +mu 0.001 +ninv 10 +nmr 4 +ncy 4 + +[Deflation projection] +nkv 16 +nmx 128 +res 1.0e-2 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/64x32v1.ms1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/64x32v1.ms1.in new file mode 100644 index 0000000000000000000000000000000000000000..8b127a4868ae2df35e3aa2318313119ccfd55ff8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/64x32v1.ms1.in @@ -0,0 +1,91 @@ + +################################################################################ +# +# 2+1 flavour QCD, mixed boundary conditions, second kind of light-quark +# twisted-mass reweighting, even-odd preconditioning and decomposition of the +# light-quark reweighting factor in 2 factors. +# +################################################################################ + +[Run name] +name 64x32v1 + +[Directories] +log_dir /data/openQCD/ms1/log +dat_dir /data/openQCD/ms1/dat +loc_dir /ndata/openQCD/cnfg +cnfg_dir /data/openQCD/cnfg + +[Configurations] +first 30 +last 100 +step 1 +nrw 2 + +[Random number generator] +level 0 +seed 126819 + +[Lattice parameters] +kappa 0.13774 0.1366 +csw 1.715 + +[Boundary conditions] +type 2 +phi' 0.0 0.0 +cG 1.0 +cG' 1.0 +cF 1.0 +cF' 1.0 + +[Reweighting factor 0] +rwfact RWTM2_EO +im0 0 +mu 0.001 0.002 +isp 0 +nsrc 24 + +[Reweighting factor 1] +rwfact RWRAT +im0 1 +irp 0 +np 6 3 +isp 1 0 +nsrc 1 + +[Rational 0] +degree 9 +range 0.03 6.1 + +[Solver 0] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 256 +res 1.0e-11 + +[Solver 1] +solver MSCG +nmx 2048 +res 1.0e-11 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.13774 +mu 0.005 +ninv 10 +nmr 4 +ncy 4 + +[Deflation projection] +nkv 24 +nmx 128 +res 1.0e-2 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/64x32v2.ms1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/64x32v2.ms1.in new file mode 100644 index 0000000000000000000000000000000000000000..d127eddc7073ea5482ceeebd5eb251d2a97dd586 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/64x32v2.ms1.in @@ -0,0 +1,88 @@ + +################################################################################ +# +# 2+1 flavour QCD "at the physical point", open boundary conditions, even-odd +# preconditioning, light-quark twisted-mass reweighting of the second kind and +# decomposition of the reweighting factor in 2 factors. +# +################################################################################ + +[Run name] +name 64x32v2 + +[Directories] +log_dir /data/openQCD/ms1/log +dat_dir /data/openQCD/ms1/dat +loc_dir /ndata/openQCD/cnfg +cnfg_dir /data/openQCD/cnfg + +[Configurations] +first 35 +last 52 +step 1 +nrw 2 + +[Random number generator] +level 0 +seed 193392 + +[Lattice parameters] +kappa 0.137796 0.136634 +csw 1.715000 + +[Boundary conditions] +type 0 +cG 1.0 +cF 1.0 + +[Reweighting factor 0] +rwfact RWTM2_EO +im0 0 +mu 0.0005 0.0012 +isp 0 +nsrc 24 + +[Reweighting factor 1] +rwfact RWRAT +im0 1 +irp 0 +np 6 3 +isp 1 0 +nsrc 4 + +[Rational 0] +degree 9 +range 0.030 6.10 + +[Solver 0] +solver DFL_SAP_GCR +nkv 32 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-11 + +[Solver 1] +solver MSCG +nmx 2048 +res 1.0e-11 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.13770 +mu 0.001 +ninv 10 +nmr 4 +ncy 4 + +[Deflation projection] +nkv 24 +nmx 128 +res 1.0e-2 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..7c3ec962ba381fd43fe264869219fd49ad253909 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/INDEX @@ -0,0 +1,10 @@ + +******************************************************************************** + + Input parameter files for the program ms1 + +******************************************************************************** + +The file names correspond to those of the simulation input parameter files in +the directory examples/qcd1. + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/48x24v1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/48x24v1.in new file mode 100644 index 0000000000000000000000000000000000000000..8a9a0fdee7a72687c598d567f1096502beaee26e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/48x24v1.in @@ -0,0 +1,166 @@ + +################################################################################ +# +# Two-flavour QCD with Wilson plaquette action, open boundary conditions and +# twisted-mass reweighting of the first kind. +# +################################################################################ + +[Run name] +name 48x24v1 + +[Directories] +log_dir /data/openQCD/qcd1/log +dat_dir /data/openQCD/qcd1/dat +loc_dir /ndata/openQCD/cnfg +cnfg_dir /data/openQCD/cnfg + +[Lattice parameters] +beta 5.3 +c0 1.0 +kappa 0.13625 +csw 1.90952 + +[Boundary conditions] +type 0 +cG 1.0 +cF 1.0 + +[Random number generator] +level 0 +seed 787412 + +[HMC parameters] +actions 0 1 2 3 4 +npf 4 +mu 0.003 0.01 0.1 1.0 +nlv 2 +tau 2.0 + +[MD trajectories] +nth 0 +ntr 5000 +dtr_log 1 +dtr_ms 8 +dtr_cnfg 8 + +[Level 0] +integrator OMF4 +nstep 1 +forces 0 + +[Level 1] +integrator OMF4 +nstep 10 +forces 1 2 3 4 + +[Action 0] +action ACG + +[Action 1] +action ACF_TM1 +ipf 0 +im0 0 +imu 3 +isp 0 + +[Action 2] +action ACF_TM2 +ipf 1 +im0 0 +imu 2 3 +isp 1 0 + +[Action 3] +action ACF_TM2 +ipf 2 +im0 0 +imu 1 2 +isp 1 1 + +[Action 4] +action ACF_TM2 +ipf 3 +im0 0 +imu 0 1 +isp 1 1 + +[Force 0] +force FRG + +[Force 1] +force FRF_TM1 +isp 2 +ncr 3 + +[Force 2] +force FRF_TM2 +isp 3 +ncr 3 + +[Force 3] +force FRF_TM2 +isp 3 +ncr 3 + +[Force 4] +force FRF_TM2 +isp 3 +ncr 3 + +[Solver 0] +solver CGNE +nmx 512 +res 1.0e-11 + +[Solver 1] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-11 + +[Solver 2] +solver CGNE +nmx 512 +res 1.0e-10 + +[Solver 3] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-10 + +[SAP] +bs 4 6 6 4 + +[Deflation subspace] +bs 4 6 6 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.13635 +mu 0.001 +ninv 9 +nmr 4 +ncy 4 + +[Deflation projection] +nkv 16 +nmx 128 +res 1.0e-2 + +[Deflation update scheme] +dtau 0.09 +nsm 1 + +[Wilson flow] +integrator RK3 +eps 2.0e-2 +nstep 400 +dnms 2 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/48x24v2.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/48x24v2.in new file mode 100644 index 0000000000000000000000000000000000000000..e75ade620e243b2ba2fe1855ae5149605c72187d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/48x24v2.in @@ -0,0 +1,181 @@ + +################################################################################ +# +# Two-flavour QCD with Wilson plaquette action, periodic boundary conditions +# and twisted-mass reweighting of the second kind. +# +################################################################################ + +[Run name] +name 48x24v2 + +[Directories] +log_dir /data/openQCD/qcd1/log +dat_dir /data/openQCD/qcd1/dat +loc_dir /ndata/openQCD/cnfg +cnfg_dir /data/openQCD/cnfg + +[Lattice parameters] +beta 5.3 +c0 1.0 +kappa 0.13635 +csw 1.90952 + +[Boundary conditions] +type 3 + +[Random number generator] +level 0 +seed 807721 + +[HMC parameters] +actions 0 1 2 3 4 5 +npf 5 +mu 3.0e-3 4.2426406871192851e-3 0.01 0.1 1.0 +nlv 3 +tau 2.0 + +[MD trajectories] +nth 0 +ntr 5000 +dtr_log 1 +dtr_ms 8 +dtr_cnfg 8 + +[Level 0] +integrator OMF4 +nstep 1 +forces 0 + +[Level 1] +integrator OMF4 +nstep 1 +forces 1 2 3 4 + +[Level 2] +integrator LPFR +nstep 10 +forces 5 + +[Action 0] +action ACG + +[Action 1] +action ACF_TM1 +ipf 0 +im0 0 +imu 4 +isp 0 + +[Action 2] +action ACF_TM2 +ipf 1 +im0 0 +imu 3 4 +isp 1 0 + +[Action 3] +action ACF_TM2 +ipf 2 +im0 0 +imu 2 3 +isp 1 1 + +[Action 4] +action ACF_TM2 +ipf 3 +im0 0 +imu 0 2 +isp 1 1 + +[Action 5] +action ACF_TM2 +ipf 4 +im0 0 +imu 0 1 +isp 1 1 + +[Force 0] +force FRG + +[Force 1] +force FRF_TM1 +isp 2 +ncr 3 + +[Force 2] +force FRF_TM2 +isp 3 +ncr 3 + +[Force 3] +force FRF_TM2 +isp 3 +ncr 3 + +[Force 4] +force FRF_TM2 +isp 3 +ncr 3 + +[Force 5] +force FRF_TM2 +isp 3 +ncr 1 + +[Solver 0] +solver CGNE +nmx 512 +res 1.0e-11 + +[Solver 1] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-11 + +[Solver 2] +solver CGNE +nmx 512 +res 1.0e-10 + +[Solver 3] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-10 + +[SAP] +bs 4 6 6 4 + +[Deflation subspace] +bs 4 6 6 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.13635 +mu 0.001 +ninv 9 +nmr 4 +ncy 4 + +[Deflation projection] +nkv 16 +nmx 128 +res 1.0e-2 + +[Deflation update scheme] +dtau 0.09 +nsm 1 + +[Wilson flow] +integrator RK3 +eps 2.0e-2 +nstep 400 +dnms 2 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/48x24v3.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/48x24v3.in new file mode 100644 index 0000000000000000000000000000000000000000..db211390ca2e65188b599670161c3e0260cbac99 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/48x24v3.in @@ -0,0 +1,185 @@ + +################################################################################ +# +# Two-flavour QCD with Wilson plaquette action, SF boundary conditions, +# twisted-mass reweighting of the second kind and even-odd preconditioning. +# +################################################################################ + +[Run name] +name 48x24v3 + +[Directories] +log_dir /data/openQCD/qcd1/log +dat_dir /data/openQCD/qcd1/dat +loc_dir /ndata/openQCD/cnfg +cnfg_dir /data/openQCD/cnfg + +[Lattice parameters] +beta 5.3 +c0 1.0 +kappa 0.13635 +csw 1.90952 + +[Boundary conditions] +type 1 +phi 0.5 -0.25 +phi' 0.0 0.0 +cG 1.0 +cF 1.0 + +[Random number generator] +level 0 +seed 695959 + +[HMC parameters] +actions 0 1 2 3 4 5 +npf 5 +mu 4.5e-3 6.363961030678928e-3 0.01 0.1 1.0 +nlv 3 +tau 2.0 + +[MD trajectories] +nth 0 +ntr 5000 +dtr_log 1 +dtr_ms 8 +dtr_cnfg 8 + +[Level 0] +integrator OMF4 +nstep 1 +forces 0 + +[Level 1] +integrator OMF4 +nstep 1 +forces 1 2 3 4 + +[Level 2] +integrator LPFR +nstep 10 +forces 5 + +[Action 0] +action ACG + +[Action 1] +action ACF_TM1_EO_SDET +ipf 0 +im0 0 +imu 4 +isp 0 + +[Action 2] +action ACF_TM2_EO +ipf 1 +im0 0 +imu 3 4 +isp 1 0 + +[Action 3] +action ACF_TM2_EO +ipf 2 +im0 0 +imu 2 3 +isp 1 1 + +[Action 4] +action ACF_TM2_EO +ipf 3 +im0 0 +imu 0 2 +isp 1 1 + +[Action 5] +action ACF_TM2_EO +ipf 4 +im0 0 +imu 0 1 +isp 1 1 + +[Force 0] +force FRG + +[Force 1] +force FRF_TM1_EO_SDET +isp 2 +ncr 3 + +[Force 2] +force FRF_TM2_EO +isp 3 +ncr 3 + +[Force 3] +force FRF_TM2_EO +isp 3 +ncr 3 + +[Force 4] +force FRF_TM2_EO +isp 3 +ncr 3 + +[Force 5] +force FRF_TM2_EO +isp 3 +ncr 1 + +[Solver 0] +solver CGNE +nmx 512 +res 1.0e-11 + +[Solver 1] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-11 + +[Solver 2] +solver CGNE +nmx 512 +res 1.0e-10 + +[Solver 3] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-10 + +[SAP] +bs 4 6 6 4 + +[Deflation subspace] +bs 4 6 6 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.13635 +mu 0.001 +ninv 10 +nmr 4 +ncy 4 + +[Deflation projection] +nkv 16 +nmx 128 +res 1.0e-2 + +[Deflation update scheme] +dtau 0.09 +nsm 1 + +[Wilson flow] +integrator RK3 +eps 2.0e-2 +nstep 400 +dnms 2 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/64x32v1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/64x32v1.in new file mode 100644 index 0000000000000000000000000000000000000000..5c3a555e3c82847364c46409676d54477303f6a9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/64x32v1.in @@ -0,0 +1,233 @@ + +################################################################################ +# +# 2+1 flavour QCD with Iwasaki action, mixed boundary conditions, second kind +# of light-quark twisted-mass reweighting and even-odd preconditioning. +# +################################################################################ + +[Run name] +name 64x32v1 + +[Directories] +log_dir /data/openQCD/qcd1/log +dat_dir /data/openQCD/qcd1/dat +loc_dir /ndata/openQCD/cnfg +cnfg_dir /data/openQCD/cnfg + +[Lattice parameters] +beta 1.9 +c0 3.648 +kappa 0.13774 0.1366 +csw 1.715 + +[Boundary conditions] +type 2 +phi' 0.0 0.0 +cG 1.0 +cG' 1.0 +cF 1.0 +cF' 1.0 + +[Random number generator] +level 0 +seed 8641 + +[HMC parameters] +actions 0 1 2 3 4 5 6 7 8 +npf 8 +mu 0.002 0.002828427124746190 0.05 0.5 +nlv 3 +tau 1.2 + +[MD trajectories] +nth 0 +ntr 2400 +dtr_log 1 +dtr_ms 8 +dtr_cnfg 8 + +[Level 0] +integrator OMF4 +nstep 1 +forces 0 + +[Level 1] +integrator OMF4 +nstep 1 +forces 1 2 3 5 6 + +[Level 2] +integrator OMF2 +lambda 0.1666667 +nstep 4 +forces 4 7 8 + +[Rational 0] +degree 9 +range 0.03 6.1 + +[Action 0] +action ACG + +[Action 1] +action ACF_TM1_EO_SDET +ipf 0 +im0 0 +imu 3 +isp 0 + +[Action 2] +action ACF_TM2_EO +ipf 1 +im0 0 +imu 2 3 +isp 1 0 + +[Action 3] +action ACF_TM2_EO +ipf 2 +im0 0 +imu 0 2 +isp 1 1 + +[Action 4] +action ACF_TM2_EO +ipf 3 +im0 0 +imu 0 1 +isp 1 1 + +[Action 5] +action ACF_RAT_SDET +ipf 4 +im0 1 +irat 0 0 5 +isp 4 + +[Action 6] +action ACF_RAT +ipf 5 +im0 1 +irat 0 6 6 +isp 1 + +[Action 7] +action ACF_RAT +ipf 6 +im0 1 +irat 0 7 7 +isp 1 + +[Action 8] +action ACF_RAT +ipf 7 +im0 1 +irat 0 8 8 +isp 1 + +[Force 0] +force FRG + +[Force 1] +force FRF_TM1_EO_SDET +isp 2 +ncr 4 + +[Force 2] +force FRF_TM2_EO +isp 3 +ncr 3 + +[Force 3] +force FRF_TM2_EO +isp 3 +ncr 3 + +[Force 4] +force FRF_TM2_EO +isp 3 +ncr 1 + +[Force 5] +force FRF_RAT_SDET +isp 5 + +[Force 6] +force FRF_RAT +isp 3 + +[Force 7] +force FRF_RAT +isp 3 + +[Force 8] +force FRF_RAT +isp 3 + +[Solver 0] +solver CGNE +nmx 1024 +res 1.0e-11 + +[Solver 1] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-11 + +[Solver 2] +solver CGNE +nmx 1024 +res 1.0e-10 + +[Solver 3] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-10 + +[Solver 4] +solver MSCG +nmx 1024 +res 1.e-11 + +[Solver 5] +solver MSCG +nmx 1024 +res 1.e-10 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.13774 +mu 0.001 +ninv 10 +nmr 4 +ncy 4 + +[Deflation projection] +nkv 24 +nmx 128 +res 1.0e-2 + +[Deflation update scheme] +dtau 0.05 +nsm 1 + +[Wilson flow] +integrator RK3 +eps 1.0e-2 +nstep 600 +dnms 10 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/64x32v2.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/64x32v2.in new file mode 100644 index 0000000000000000000000000000000000000000..1be8ea98b2d82d47a8352a19a26f2eb842e61b40 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/64x32v2.in @@ -0,0 +1,231 @@ + +################################################################################ +# +# 2+1 flavour QCD "at the physical point" with Iwasaki action, open boundary +# conditions, second kind of light-quark twisted-mass reweighting and even-odd +# preconditioning. +# +################################################################################ + +[Run name] +name 64x32v2 + +[Directories] +log_dir /data/openQCD/qcd1/log +dat_dir /data/openQCD/qcd1/dat +loc_dir /ndata/openQCD/cnfg +cnfg_dir /data/openQCD/cnfg + +[Lattice parameters] +beta 1.9 +c0 3.648 +kappa 0.137796 0.136634 +csw 1.715 + +[Boundary conditions] +type 0 +cG 1.0 +cF 1.0 + +[Random number generator] +level 0 +seed 1026 + +[HMC parameters] +actions 0 1 2 3 4 5 6 7 8 +npf 8 +mu 0.0012 0.001697056274847714 0.05 0.5 +nlv 3 +tau 1.1 + +[MD trajectories] +nth 0 +ntr 8000 +dtr_log 1 +dtr_ms 4 +dtr_cnfg 4 + +[Level 0] +integrator OMF4 +nstep 1 +forces 0 + +[Level 1] +integrator OMF4 +nstep 1 +forces 1 2 3 5 6 + +[Level 2] +integrator OMF2 +lambda 0.1666667 +nstep 6 +forces 4 7 8 + +[Rational 0] +degree 9 +range 0.03 6.1 + +[Action 0] +action ACG + +[Action 1] +action ACF_TM1_EO_SDET +ipf 0 +im0 0 +imu 3 +isp 0 + +[Action 2] +action ACF_TM2_EO +ipf 1 +im0 0 +imu 2 3 +isp 1 0 + +[Action 3] +action ACF_TM2_EO +ipf 2 +im0 0 +imu 0 2 +isp 1 1 + +[Action 4] +action ACF_TM2_EO +ipf 3 +im0 0 +imu 0 1 +isp 1 1 + +[Action 5] +action ACF_RAT_SDET +ipf 4 +im0 1 +irat 0 0 5 +isp 4 + +[Action 6] +action ACF_RAT +ipf 5 +im0 1 +irat 0 6 6 +isp 1 + +[Action 7] +action ACF_RAT +ipf 6 +im0 1 +irat 0 7 7 +isp 1 + +[Action 8] +action ACF_RAT +ipf 7 +im0 1 +irat 0 8 8 +isp 1 + +[Force 0] +force FRG + +[Force 1] +force FRF_TM1_EO_SDET +isp 2 +ncr 4 + +[Force 2] +force FRF_TM2_EO +isp 3 +ncr 3 + +[Force 3] +force FRF_TM2_EO +isp 3 +ncr 3 + +[Force 4] +force FRF_TM2_EO +isp 3 +ncr 1 + +[Force 5] +force FRF_RAT_SDET +isp 5 + +[Force 6] +force FRF_RAT +isp 3 + +[Force 7] +force FRF_RAT +isp 3 + +[Force 8] +force FRF_RAT +isp 3 + +[Solver 0] +solver CGNE +nmx 1024 +res 1.0e-11 + +[Solver 1] +solver DFL_SAP_GCR +nkv 24 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-11 + +[Solver 2] +solver CGNE +nmx 1024 +res 1.0e-10 + +[Solver 3] +solver DFL_SAP_GCR +nkv 24 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-10 + +[Solver 4] +solver MSCG +nmx 1024 +res 1.e-11 + +[Solver 5] +solver MSCG +nmx 1024 +res 1.e-10 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.13770 +mu 0.001 +ninv 9 +nmr 4 +ncy 4 + +[Deflation projection] +nkv 24 +nmx 128 +res 1.0e-2 + +[Deflation update scheme] +dtau 0.037 +nsm 1 + +[Wilson flow] +integrator RK3 +eps 1.0e-2 +nstep 600 +dnms 10 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..6f5cdf02a669c0a9e3c820232cff45fd19dfd37e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/INDEX @@ -0,0 +1,27 @@ + +******************************************************************************** + + Input parameter files for the program qcd1 + +******************************************************************************** + +48x24v1.in Two-flavour QCD with Wilson plaquette action, open + boundary conditions and twisted-mass reweighting of + the first kind. + +48x24v2.in Two-flavour QCD with Wilson plaquette action, periodic + boundary conditions and twisted-mass reweighting of the + second kind. + +48x24v3.in Two-flavour QCD with Wilson plaquette action, SF + boundary conditions, twisted-mass reweighting of the + second kind and even-odd preconditioning. + +64x32v1.in 2+1 flavour QCD with Iwasaki action, mixed boundary + conditions, second kind of light-quark twisted-mass + reweighting and even-odd preconditioning. + +64x32v2.in 2+1 flavour QCD "at the physical point" with Iwasaki + action, open boundary conditions, second kind of + light-quark twisted-mass reweighting and even-odd + preconditioning. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ms1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ms1.c new file mode 100644 index 0000000000000000000000000000000000000000..8b85a5457df2400d1c73f18aa9190ce8d7fa3665 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ms1.c @@ -0,0 +1,1482 @@ + +/******************************************************************************* +* +* File ms1.c +* +* Copyright (C) 2012-2014 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Stochastic estimation of reweighting factors. +* +* Syntax: ms1 -i [-noexp] [-a [-norng]] +* +* For usage instructions see the file README.ms1. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "archive.h" +#include "dfl.h" +#include "update.h" +#include "version.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +#define MAX(n,m) \ + if ((n)<(m)) \ + (n)=(m) + +static struct +{ + int nrw; + int *nfct,*nsrc; +} file_head; + +static struct +{ + int nc; + double ***sqn,***lnr; +} data; + +static int my_rank,noexp,append,norng,endian; +static int first,last,step,level,seed; +static int ipgrd[2],**rwstat=NULL,*rlxs_state=NULL,*rlxd_state=NULL; + +static char line[NAME_SIZE]; +static char log_dir[NAME_SIZE],dat_dir[NAME_SIZE]; +static char loc_dir[NAME_SIZE],cnfg_dir[NAME_SIZE]; +static char log_file[NAME_SIZE],log_save[NAME_SIZE],end_file[NAME_SIZE]; +static char par_file[NAME_SIZE],par_save[NAME_SIZE]; +static char dat_file[NAME_SIZE],dat_save[NAME_SIZE]; +static char rng_file[NAME_SIZE],rng_save[NAME_SIZE]; +static char cnfg_file[NAME_SIZE],nbase[NAME_SIZE]; +static FILE *fin=NULL,*flog=NULL,*fdat=NULL,*fend=NULL; + +static lat_parms_t lat; +static bc_parms_t bcp; + + +static void alloc_data(void) +{ + int nrw,*nfct,*nsrc; + int i,irw,ifct,n1,n2,n3; + double ***ppp,**pp,*p; + + nrw=file_head.nrw; + nfct=file_head.nfct; + nsrc=file_head.nsrc; + n1=nrw; + n2=0; + n3=0; + + for (irw=0;irw=NAME_SIZE, + 1,"setup_files [ms1.c]","loc_dir name is too long"); + else + error_root(name_size("%s/%sn%d",cnfg_dir,nbase,last)>=NAME_SIZE, + 1,"setup_files [ms1.c]","cnfg_dir name is too long"); + + check_dir_root(log_dir); + check_dir_root(dat_dir); + error_root(name_size("%s/%s.ms1.log~",log_dir,nbase)>=NAME_SIZE, + 1,"setup_files [ms1.c]","log_dir name is too long"); + error_root(name_size("%s/%s.ms1.dat~",dat_dir,nbase)>=NAME_SIZE, + 1,"setup_files [ms1.c]","dat_dir name is too long"); + + sprintf(log_file,"%s/%s.ms1.log",log_dir,nbase); + sprintf(par_file,"%s/%s.ms1.par",dat_dir,nbase); + sprintf(dat_file,"%s/%s.ms1.dat",dat_dir,nbase); + sprintf(rng_file,"%s/%s.ms1.rng",dat_dir,nbase); + sprintf(end_file,"%s/%s.ms1.end",log_dir,nbase); + sprintf(log_save,"%s~",log_file); + sprintf(par_save,"%s~",par_file); + sprintf(dat_save,"%s~",dat_file); + sprintf(rng_save,"%s~",rng_file); +} + + +static void read_lat_parms(void) +{ + int nk; + double csw,*kappa; + + if (my_rank==0) + { + find_section("Lattice parameters"); + nk=count_tokens("kappa"); + error_root(nk<1,1,"read_lat_parms [ms1.c]", + "Missing hopping parameter values"); + read_line("csw","%lf",&csw); + } + + MPI_Bcast(&nk,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&csw,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + kappa=malloc(nk*sizeof(*kappa)); + error(kappa==NULL,1,"read_lat_parms [check2.c]", + "Unable to allocate parameter array"); + if (my_rank==0) + read_dprms("kappa",nk,kappa); + MPI_Bcast(kappa,nk,MPI_DOUBLE,0,MPI_COMM_WORLD); + + lat=set_lat_parms(0.0,1.0,nk,kappa,csw); + free(kappa); + + if (append) + check_lat_parms(fdat); + else + write_lat_parms(fdat); +} + + +static void read_bc_parms(void) +{ + int bc; + double cF,cF_prime; + double phi[2],phi_prime[2]; + + if (my_rank==0) + { + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + cF=1.0; + cF_prime=1.0; + + if (bc==1) + read_dprms("phi",2,phi); + + if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + + if (bc!=3) + read_line("cF","%lf",&cF); + + if (bc==2) + read_line("cF'","%lf",&cF_prime); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + bcp=set_bc_parms(bc,1.0,1.0,cF,cF_prime,phi,phi_prime); + + if (append) + check_bc_parms(fdat); + else + write_bc_parms(fdat); +} + + +static void read_rw_factors(void) +{ + int nrw,*nfct,*nsrc,irw,irp; + rw_parms_t rwp; + rat_parms_t rp; + + nrw=file_head.nrw; + nfct=file_head.nfct; + nsrc=file_head.nsrc; + + for (irw=0;irw [-noexp] [-a [-norng]]"); + + error_root(endian==UNKNOWN_ENDIAN,1,"read_infile [ms1.c]", + "Machine has unknown endianness"); + + noexp=find_opt(argc,argv,"-noexp"); + append=find_opt(argc,argv,"-a"); + norng=find_opt(argc,argv,"-norng"); + + fin=freopen(argv[ifile+1],"r",stdin); + error_root(fin==NULL,1,"read_infile [ms1.c]", + "Unable to open input file"); + } + + MPI_Bcast(&endian,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&noexp,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&append,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&norng,1,MPI_INT,0,MPI_COMM_WORLD); + + read_dirs(); + setup_files(); + + if (my_rank==0) + { + if (append) + fdat=fopen(par_file,"rb"); + else + fdat=fopen(par_file,"wb"); + + error_root(fdat==NULL,1,"read_infile [ms1.c]", + "Unable to open parameter file"); + } + + if (my_rank==0) + { + find_section("Random number generator"); + read_line("level","%d",&level); + read_line("seed","%d",&seed); + } + + MPI_Bcast(&level,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&seed,1,MPI_INT,0,MPI_COMM_WORLD); + + read_lat_parms(); + read_bc_parms(); + read_rw_factors(); + read_solvers(); + + if (my_rank==0) + { + fclose(fin); + fclose(fdat); + + if (append==0) + copy_file(par_file,par_save); + } +} + + +static void check_old_log(int *fst,int *lst,int *stp) +{ + int ie,ic,isv; + int fc,lc,dc,pc; + int np[4],bp[4]; + + fend=fopen(log_file,"r"); + error_root(fend==NULL,1,"check_old_log [ms1.c]", + "Unable to open log file"); + + fc=0; + lc=0; + dc=0; + pc=0; + + ie=0x0; + ic=0; + isv=0; + + while (fgets(line,NAME_SIZE,fend)!=NULL) + { + if (strstr(line,"process grid")!=NULL) + { + if (sscanf(line,"%dx%dx%dx%d process grid, %dx%dx%dx%d", + np,np+1,np+2,np+3,bp,bp+1,bp+2,bp+3)==8) + { + ipgrd[0]=((np[0]!=NPROC0)||(np[1]!=NPROC1)|| + (np[2]!=NPROC2)||(np[3]!=NPROC3)); + ipgrd[1]=((bp[0]!=NPROC0_BLK)||(bp[1]!=NPROC1_BLK)|| + (bp[2]!=NPROC2_BLK)||(bp[3]!=NPROC3_BLK)); + } + else + ie|=0x1; + } + else if (strstr(line,"fully processed")!=NULL) + { + pc=lc; + + if (sscanf(line,"Configuration no %d",&lc)==1) + { + ic+=1; + isv=1; + } + else + ie|=0x1; + + if (ic==1) + fc=lc; + else if (ic==2) + dc=lc-fc; + else if ((ic>2)&&(lc!=(pc+dc))) + ie|=0x2; + } + else if (strstr(line,"Configuration no")!=NULL) + isv=0; + } + + fclose(fend); + + error_root((ie&0x1)!=0x0,1,"check_old_log [ms1.c]", + "Incorrect read count"); + error_root((ie&0x2)!=0x0,1,"check_old_log [ms1.c]", + "Configuration numbers are not equally spaced"); + error_root(isv==0,1,"check_old_log [ms1.c]", + "Log file extends beyond the last configuration save"); + + (*fst)=fc; + (*lst)=lc; + (*stp)=dc; +} + + +static void check_old_dat(int fst,int lst,int stp) +{ + int ie,ic; + int fc,lc,dc,pc; + + fdat=fopen(dat_file,"rb"); + error_root(fdat==NULL,1,"check_old_dat [ms1.c]", + "Unable to open data file"); + + check_file_head(); + + fc=0; + lc=0; + dc=0; + pc=0; + + ie=0x0; + ic=0; + + while (read_data()==1) + { + pc=lc; + lc=data.nc; + ic+=1; + + if (ic==1) + fc=lc; + else if (ic==2) + dc=lc-fc; + else if ((ic>2)&&(lc!=(pc+dc))) + ie|=0x1; + } + + fclose(fdat); + + error_root(ic==0,1,"check_old_dat [ms1.c]", + "No data records found"); + error_root((ie&0x1)!=0x0,1,"check_old_dat [ms1.c]", + "Configuration numbers are not equally spaced"); + error_root((fst!=fc)||(lst!=lc)||(stp!=dc),1,"check_old_dat [ms1.c]", + "Configuration range is not as reported in the log file"); +} + + +static void check_files(void) +{ + int fst,lst,stp; + + ipgrd[0]=0; + ipgrd[1]=0; + + if (my_rank==0) + { + if (append) + { + check_old_log(&fst,&lst,&stp); + check_old_dat(fst,lst,stp); + + error_root((fst!=lst)&&(stp!=step),1,"check_files [ms1.c]", + "Continuation run:\n" + "Previous run had a different configuration separation"); + error_root(first!=lst+step,1,"check_files [ms1.c]", + "Continuation run:\n" + "Configuration range does not continue the previous one"); + } + else + { + fin=fopen(log_file,"r"); + fdat=fopen(dat_file,"rb"); + + error_root((fin!=NULL)||(fdat!=NULL),1,"check_files [ms1.c]", + "Attempt to overwrite old *.log or *.dat file"); + + fdat=fopen(dat_file,"wb"); + error_root(fdat==NULL,1,"check_files [ms1.c]", + "Unable to open data file"); + write_file_head(); + fclose(fdat); + } + } +} + + +static void print_info(void) +{ + int isap,idfl,ik,n[3]; + long ip; + + if (my_rank==0) + { + ip=ftell(flog); + fclose(flog); + + if (ip==0L) + remove("STARTUP_ERROR"); + + if (append) + flog=freopen(log_file,"a",stdout); + else + flog=freopen(log_file,"w",stdout); + + error_root(flog==NULL,1,"print_info [ms1.c]","Unable to open log file"); + printf("\n"); + + if (append) + printf("Continuation run\n\n"); + else + { + printf("Measurement of reweighting factors\n"); + printf("----------------------------------\n\n"); + } + + printf("Program version %s\n",openQCD_RELEASE); + + if (endian==LITTLE_ENDIAN) + printf("The machine is little endian\n"); + else + printf("The machine is big endian\n"); + if (noexp) + printf("Configurations are read in imported file format\n\n"); + else + printf("Configurations are read in exported file format\n\n"); + + if ((ipgrd[0]!=0)&&(ipgrd[1]!=0)) + printf("Process grid and process block size changed:\n"); + else if (ipgrd[0]!=0) + printf("Process grid changed:\n"); + else if (ipgrd[1]!=0) + printf("Process block size changed:\n"); + + if ((append==0)||(ipgrd[0]!=0)||(ipgrd[1]!=0)) + { + printf("%dx%dx%dx%d lattice, ",N0,N1,N2,N3); + printf("%dx%dx%dx%d local lattice\n",L0,L1,L2,L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d process block size\n\n", + NPROC0_BLK,NPROC1_BLK,NPROC2_BLK,NPROC3_BLK); + } + + if (append) + { + printf("Random number generator:\n"); + + if (norng) + printf("level = %d, seed = %d, effective seed = %d\n\n", + level,seed,seed^(first-step)); + else + { + printf("State of ranlxs and ranlxd reset to the\n"); + printf("last exported state\n\n"); + } + } + else + { + printf("Random number generator:\n"); + printf("level = %d, seed = %d\n\n",level,seed); + + printf("Lattice parameters:\n"); + + for (ik=0;ik=11) + printf("kappa[%2d] = %.*f\n",ik,IMAX(n[0],6),lat.kappa[ik]); + else + printf("kappa[%1d] = %.*f\n",ik,IMAX(n[0],6),lat.kappa[ik]); + } + + n[0]=fdigits(lat.csw); + printf("csw = %.*f\n\n",IMAX(n[0],1),lat.csw); + + if (bcp.type==0) + { + printf("Open boundary conditions\n"); + + n[0]=fdigits(bcp.cF[0]); + printf("cF = %.*f\n\n",IMAX(n[0],1),bcp.cF[0]); + } + else if (bcp.type==1) + { + printf("SF boundary conditions\n"); + + n[0]=fdigits(bcp.cF[0]); + printf("cF = %.*f\n",IMAX(n[0],1),bcp.cF[0]); + + n[0]=fdigits(bcp.phi[0][0]); + n[1]=fdigits(bcp.phi[0][1]); + n[2]=fdigits(bcp.phi[0][2]); + printf("phi = %.*f,%.*f,%.*f\n",IMAX(n[0],1),bcp.phi[0][0], + IMAX(n[1],1),bcp.phi[0][1],IMAX(n[2],1),bcp.phi[0][2]); + + n[0]=fdigits(bcp.phi[1][0]); + n[1]=fdigits(bcp.phi[1][1]); + n[2]=fdigits(bcp.phi[1][2]); + printf("phi' = %.*f,%.*f,%.*f\n\n",IMAX(n[0],1),bcp.phi[1][0], + IMAX(n[1],1),bcp.phi[1][1],IMAX(n[2],1),bcp.phi[1][2]); + } + else if (bcp.type==2) + { + printf("Open-SF boundary conditions\n"); + + n[0]=fdigits(bcp.cF[0]); + printf("cF = %.*f\n",IMAX(n[0],1),bcp.cF[0]); + n[1]=fdigits(bcp.cF[1]); + printf("cF' = %.*f\n",IMAX(n[1],1),bcp.cF[1]); + + n[0]=fdigits(bcp.phi[1][0]); + n[1]=fdigits(bcp.phi[1][1]); + n[2]=fdigits(bcp.phi[1][2]); + printf("phi' = %.*f,%.*f,%.*f\n\n",IMAX(n[0],1),bcp.phi[1][0], + IMAX(n[1],1),bcp.phi[1][1],IMAX(n[2],1),bcp.phi[1][2]); + } + else + printf("Periodic boundary conditions\n\n"); + + print_rw_parms(); + print_rat_parms(); + print_solver_parms(&isap,&idfl); + + if (isap) + print_sap_parms(0); + + if (idfl) + print_dfl_parms(0); + } + + printf("Configurations no %d -> %d in steps of %d\n\n", + first,last,step); + fflush(flog); + } +} + + +static void dfl_wsize(int *nws,int *nwv,int *nwvd) +{ + dfl_parms_t dp; + dfl_pro_parms_t dpp; + + dp=dfl_parms(); + dpp=dfl_pro_parms(); + + MAX(*nws,dp.Ns+2); + MAX(*nwv,2*dpp.nkv+2); + MAX(*nwvd,4); +} + + +static void solver_wsize(int isp,int nsd,int np, + int *nws,int *nwsd,int *nwv,int *nwvd) +{ + solver_parms_t sp; + + sp=solver_parms(isp); + + if (sp.solver==CGNE) + { + MAX(*nws,5); + MAX(*nwsd,nsd+3); + } + else if (sp.solver==MSCG) + { + if (np>1) + { + MAX(*nwsd,nsd+np+3); + } + else + { + MAX(*nwsd,nsd+5); + } + } + else if (sp.solver==SAP_GCR) + { + MAX(*nws,2*sp.nkv+1); + MAX(*nwsd,nsd+2); + } + else if (sp.solver==DFL_SAP_GCR) + { + MAX(*nws,2*sp.nkv+2); + MAX(*nwsd,nsd+4); + dfl_wsize(nws,nwv,nwvd); + } +} + + +static void reweight_wsize(int *nws,int *nwsd,int *nwv,int *nwvd) +{ + int nrw,nfct; + int irw,ifct,nsd; + int *np,*isp; + rw_parms_t rwp; + solver_parms_t sp; + + (*nws)=0; + (*nwsd)=0; + (*nwv)=0; + (*nwvd)=0; + nrw=file_head.nrw; + + for (irw=0;irw0) + mu1=rwp.mu[ifct-1]; + else + mu1=0.0; + + mu2=rwp.mu[ifct]; + isp=rwp.isp[ifct]; + + for (isrc=0;isrc [-noexp] +* +* For usage instructions see the file README.ms2. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "archive.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "sap.h" +#include "dfl.h" +#include "ratfcts.h" +#include "forces.h" +#include "version.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +#define MAX(n,m) \ + if ((n)<(m)) \ + (n)=(m) + +static int my_rank,noexp,endian; +static int first,last,step,np_ra,np_rb; +static int *rlxs_state=NULL,*rlxd_state=NULL; +static double ar[256]; + +static char log_dir[NAME_SIZE],loc_dir[NAME_SIZE],cnfg_dir[NAME_SIZE]; +static char log_file[NAME_SIZE],log_save[NAME_SIZE],end_file[NAME_SIZE]; +static char cnfg_file[NAME_SIZE],nbase[NAME_SIZE]; +static FILE *fin=NULL,*flog=NULL,*fend=NULL; + +static lat_parms_t lat; +static bc_parms_t bcp; + + +static void read_dirs(void) +{ + if (my_rank==0) + { + find_section("Run name"); + read_line("name","%s",nbase); + + find_section("Directories"); + read_line("log_dir","%s",log_dir); + + if (noexp) + { + read_line("loc_dir","%s",loc_dir); + cnfg_dir[0]='\0'; + } + else + { + read_line("cnfg_dir","%s",cnfg_dir); + loc_dir[0]='\0'; + } + + find_section("Configurations"); + read_line("first","%d",&first); + read_line("last","%d",&last); + read_line("step","%d",&step); + + error_root((last=NAME_SIZE, + 1,"setup_files [ms2.c]","loc_dir name is too long"); + else + error_root(name_size("%s/%sn%d",cnfg_dir,nbase,last)>=NAME_SIZE, + 1,"setup_files [ms2.c]","cnfg_dir name is too long"); + + check_dir_root(log_dir); + error_root(name_size("%s/%s.ms2.log~",log_dir,nbase)>=NAME_SIZE, + 1,"setup_files [ms2.c]","log_dir name is too long"); + + sprintf(log_file,"%s/%s.ms2.log",log_dir,nbase); + sprintf(end_file,"%s/%s.ms2.end",log_dir,nbase); + sprintf(log_save,"%s~",log_file); +} + + +static void read_lat_parms(void) +{ + double kappa,csw; + + if (my_rank==0) + { + find_section("Dirac operator"); + read_line("kappa","%lf",&kappa); + read_line("csw","%lf",&csw); + } + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&csw,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + lat=set_lat_parms(0.0,1.0,1,&kappa,csw); +} + + +static void read_bc_parms(void) +{ + int bc; + double cF,cF_prime; + double phi[2],phi_prime[2]; + + if (my_rank==0) + { + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + cF=1.0; + cF_prime=1.0; + + if (bc==1) + read_dprms("phi",2,phi); + + if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + + if (bc!=3) + read_line("cF","%lf",&cF); + + if (bc==2) + read_line("cF'","%lf",&cF_prime); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + bcp=set_bc_parms(bc,1.0,1.0,cF,cF_prime,phi,phi_prime); +} + + +static void read_sap_parms(void) +{ + int bs[4]; + + if (my_rank==0) + { + find_section("SAP"); + read_line("bs","%d %d %d %d",bs,bs+1,bs+2,bs+3); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + set_sap_parms(bs,1,4,5); +} + + +static void read_dfl_parms(void) +{ + int bs[4],Ns; + int ninv,nmr,ncy,nkv,nmx; + double kappa,mu,res; + + if (my_rank==0) + { + find_section("Deflation subspace"); + read_line("bs","%d %d %d %d",bs,bs+1,bs+2,bs+3); + read_line("Ns","%d",&Ns); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_parms(bs,Ns); + + if (my_rank==0) + { + find_section("Deflation subspace generation"); + read_line("kappa","%lf",&kappa); + read_line("mu","%lf",&mu); + read_line("ninv","%d",&ninv); + read_line("nmr","%d",&nmr); + read_line("ncy","%d",&ncy); + } + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&ninv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_gen_parms(kappa,mu,ninv,nmr,ncy); + + if (my_rank==0) + { + find_section("Deflation projection"); + read_line("nkv","%d",&nkv); + read_line("nmx","%d",&nmx); + read_line("res","%lf",&res); + } + + MPI_Bcast(&nkv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + set_dfl_pro_parms(nkv,nmx,res); +} + + +static void read_solver(void) +{ + solver_parms_t sp; + + read_solver_parms(0); + sp=solver_parms(0); + + if ((sp.solver==SAP_GCR)||(sp.solver==DFL_SAP_GCR)) + read_sap_parms(); + + if (sp.solver==DFL_SAP_GCR) + read_dfl_parms(); +} + + +static void read_infile(int argc,char *argv[]) +{ + int ifile; + + if (my_rank==0) + { + flog=freopen("STARTUP_ERROR","w",stdout); + + ifile=find_opt(argc,argv,"-i"); + endian=endianness(); + + error_root((ifile==0)||(ifile==(argc-1)),1,"read_infile [ms2.c]", + "Syntax: ms2 -i [-noexp]"); + + error_root(endian==UNKNOWN_ENDIAN,1,"read_infile [ms2.c]", + "Machine has unknown endianness"); + + noexp=find_opt(argc,argv,"-noexp"); + + fin=freopen(argv[ifile+1],"r",stdin); + error_root(fin==NULL,1,"read_infile [ms2.c]", + "Unable to open input file"); + } + + MPI_Bcast(&endian,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&noexp,1,MPI_INT,0,MPI_COMM_WORLD); + + read_dirs(); + setup_files(); + read_lat_parms(); + read_bc_parms(); + + if (my_rank==0) + { + find_section("Power method"); + read_line("np_ra","%d",&np_ra); + read_line("np_rb","%d",&np_rb); + error_root((np_ra<1)||(np_rb<1),1,"read_infile [ms2.c]", + "Power method iteration numbers must be at least 1"); + } + + MPI_Bcast(&np_ra,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&np_rb,1,MPI_INT,0,MPI_COMM_WORLD); + read_solver(); + + if (my_rank==0) + fclose(fin); +} + + +static void check_files(void) +{ + if (my_rank==0) + { + fin=fopen(log_file,"r"); + error_root(fin!=NULL,1,"check_files [ms2.c]", + "Attempt to overwrite old *.log file"); + } +} + + +static void print_info(void) +{ + int isap,idfl,n[3]; + long ip; + + if (my_rank==0) + { + ip=ftell(flog); + fclose(flog); + + if (ip==0L) + remove("STARTUP_ERROR"); + + flog=freopen(log_file,"w",stdout); + error_root(flog==NULL,1,"print_info [ms2.c]","Unable to open log file"); + printf("\n"); + + printf("Spectral range of the hermitian Dirac operator\n"); + printf("----------------------------------------------\n\n"); + + printf("Program version %s\n",openQCD_RELEASE); + + if (endian==LITTLE_ENDIAN) + printf("The machine is little endian\n"); + else + printf("The machine is big endian\n"); + if (noexp) + printf("Configurations are read in imported file format\n\n"); + else + printf("Configurations are read in exported file format\n\n"); + + printf("%dx%dx%dx%d lattice, ",N0,N1,N2,N3); + printf("%dx%dx%dx%d local lattice\n",L0,L1,L2,L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d process block size\n", + NPROC0_BLK,NPROC1_BLK,NPROC2_BLK,NPROC3_BLK); + printf("SF boundary conditions on the quark fields\n\n"); + + printf("Dirac operator:\n"); + n[0]=fdigits(lat.kappa[0]); + printf("kappa = %.*f\n",IMAX(n[0],6),lat.kappa[0]); + n[0]=fdigits(lat.csw); + printf("csw = %.*f\n\n",IMAX(n[0],1),lat.csw); + + if (bcp.type==0) + { + printf("Open boundary conditions\n"); + + n[0]=fdigits(bcp.cF[0]); + printf("cF = %.*f\n\n",IMAX(n[0],1),bcp.cF[0]); + } + else if (bcp.type==1) + { + printf("SF boundary conditions\n"); + + n[0]=fdigits(bcp.cF[0]); + printf("cF = %.*f\n",IMAX(n[0],1),bcp.cF[0]); + + n[0]=fdigits(bcp.phi[0][0]); + n[1]=fdigits(bcp.phi[0][1]); + n[2]=fdigits(bcp.phi[0][2]); + printf("phi = %.*f,%.*f,%.*f\n",IMAX(n[0],1),bcp.phi[0][0], + IMAX(n[1],1),bcp.phi[0][1],IMAX(n[2],1),bcp.phi[0][2]); + + n[0]=fdigits(bcp.phi[1][0]); + n[1]=fdigits(bcp.phi[1][1]); + n[2]=fdigits(bcp.phi[1][2]); + printf("phi' = %.*f,%.*f,%.*f\n\n",IMAX(n[0],1),bcp.phi[1][0], + IMAX(n[1],1),bcp.phi[1][1],IMAX(n[2],1),bcp.phi[1][2]); + } + else if (bcp.type==2) + { + printf("Open-SF boundary conditions\n"); + + n[0]=fdigits(bcp.cF[0]); + printf("cF = %.*f\n",IMAX(n[0],1),bcp.cF[0]); + n[1]=fdigits(bcp.cF[1]); + printf("cF' = %.*f\n",IMAX(n[1],1),bcp.cF[1]); + + n[0]=fdigits(bcp.phi[1][0]); + n[1]=fdigits(bcp.phi[1][1]); + n[2]=fdigits(bcp.phi[1][2]); + printf("phi' = %.*f,%.*f,%.*f\n\n",IMAX(n[0],1),bcp.phi[1][0], + IMAX(n[1],1),bcp.phi[1][1],IMAX(n[2],1),bcp.phi[1][2]); + } + else + printf("Periodic boundary conditions\n\n"); + + printf("Power method:\n"); + printf("np_ra = %d\n",np_ra); + printf("np_rb = %d\n\n",np_rb); + + print_solver_parms(&isap,&idfl); + + if (isap) + print_sap_parms(0); + + if (idfl) + print_dfl_parms(0); + + printf("Configurations no %d -> %d in steps of %d\n\n", + first,last,step); + fflush(flog); + } +} + + +static void dfl_wsize(int *nws,int *nwv,int *nwvd) +{ + dfl_parms_t dp; + dfl_pro_parms_t dpp; + + dp=dfl_parms(); + dpp=dfl_pro_parms(); + + MAX(*nws,dp.Ns+2); + MAX(*nwv,2*dpp.nkv+2); + MAX(*nwvd,4); +} + + +static void wsize(int *nws,int *nwsd,int *nwv,int *nwvd) +{ + int nsd; + solver_parms_t sp; + + (*nws)=0; + (*nwsd)=0; + (*nwv)=0; + (*nwvd)=0; + + sp=solver_parms(0); + + if (sp.solver==CGNE) + { + nsd=1; + MAX(*nws,5); + MAX(*nwsd,nsd+3); + } + else if (sp.solver==SAP_GCR) + { + nsd=2; + MAX(*nws,2*sp.nkv+1); + MAX(*nwsd,nsd+2); + } + else if (sp.solver==DFL_SAP_GCR) + { + nsd=2; + MAX(*nws,2*sp.nkv+2); + MAX(*nwsd,nsd+4); + dfl_wsize(nws,nwv,nwvd); + } + else + error_root(1,1,"wsize [ms2.c]", + "Unknown or unsupported solver"); +} + + +static double power1(int *status) +{ + int nsd,k,l,stat[6]; + double r; + spinor_dble **wsd; + solver_parms_t sp; + sap_parms_t sap; + + set_sw_parms(sea_quark_mass(0)); + sp=solver_parms(0); + + if (sp.solver==CGNE) + { + nsd=1; + status[0]=0; + } + else if (sp.solver==SAP_GCR) + { + nsd=2; + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + status[0]=0; + } + else if (sp.solver==DFL_SAP_GCR) + { + nsd=2; + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + for (l=0;l<3;l++) + status[l]=0; + } + else + { + nsd=1; + error_root(1,1,"power1 [ms2.c]", + "Unknown or unsupported solver"); + } + + wsd=reserve_wsd(nsd); + random_sd(VOLUME/2,wsd[0],1.0); + bnd_sd2zero(EVEN_PTS,wsd[0]); + r=normalize_dble(VOLUME/2,1,wsd[0]); + + for (k=0;kramax) + ramax=ra; + raavg+=ra; + + if (rbrbmax) + rbmax=rb; + rbavg+=rb; + } + + MPI_Barrier(MPI_COMM_WORLD); + wt2=MPI_Wtime(); + wtavg+=(wt2-wt1); + error_chk(); + + if (my_rank==0) + { + printf("ra = %.2e, rb = %.2e, ",ra,rb); + + if (dfl.Ns) + printf("status = %d,%d,%d\n", + status[0],status[1],status[2]); + else + printf("status = %d\n",status[0]); + + printf("Configuration no %d fully processed in %.2e sec ", + nc,wt2-wt1); + printf("(average = %.2e sec)\n\n", + wtavg/(double)((nc-first)/step+1)); + + fflush(flog); + copy_file(log_file,log_save); + } + + check_endflag(&iend); + } + + if (my_rank==0) + { + last=nc-step; + nc=(last-first)/step+1; + + printf("Summary\n"); + printf("-------\n\n"); + + printf("Considered %d configurations in the range %d -> %d\n\n", + nc,first,last); + + printf("The three figures quoted in each case are the minimal,\n"); + printf("maximal and average values\n\n"); + + printf("Spectral gap ra = %.2e, %.2e, %.2e\n", + ramin,ramax,raavg/(double)(nc)); + printf("Spectral radius rb = %.2e, %.2e, %.2e\n\n", + rbmin,rbmax,rbavg/(double)(nc)); + + ra=0.90*ramin; + rb=1.03*rbmax; + eps=ra/rb; + eps=eps*eps; + Ne=0.5*(double)(NPROC0*L0-2)*(double)(NPROC1*NPROC2*NPROC3*L1*L2*L3); + + printf("Zolotarev rational approximation:\n\n"); + + printf("n: number of poles\n"); + printf("delta: approximation error\n"); + printf("Ne: number of even lattice points\n"); + printf("Suggested spectral range = [%.2e,%.2e]\n\n",ra,rb); + + printf(" n delta 12*Ne*delta 12*Ne*delta^2\n"); + + for (n=6;n<=128;n++) + { + zolotarev(n,eps,&A,ar,&delta); + d1=12.0*Ne*delta; + d2=d1*delta; + + printf(" %3d %.1e %.1e %.1e\n",n,delta,d1,d2); + + if ((d1<1.0e-2)&&(d2<1.0e-4)) + break; + } + + printf("\n"); + } + + error_chk(); + + if (my_rank==0) + { + fflush(flog); + copy_file(log_file,log_save); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ms2.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ms2.in new file mode 100644 index 0000000000000000000000000000000000000000..d8b413890a4fffa7986bd381bcce408f4b01908f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ms2.in @@ -0,0 +1,56 @@ + +[Run name] +name Snoopy137 + +[Directories] +log_dir ../data/ms2/log +loc_dir /ndata/qcd1/cnfg +cnfg_dir /data/qcd1/cnfg + +[Configurations] +first 1 +last 4 +step 1 + +[Dirac operator] +kappa 0.1300 +csw 1.234 + +[Boundary conditions] +type 2 +phi 0.12 -0.56 +phi' 0.92 0.76 +cF 0.95 +cF' 0.90 + +[Power method] +np_ra 20 +np_rb 100 + +[Solver 0] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-8 + +[SAP] +bs 8 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.13635 +mu 0.005 +ninv 10 +nmr 4 +ncy 4 + +[Deflation projection] +nkv 24 +nmx 512 +res 1.0e-2 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ms3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ms3.c new file mode 100644 index 0000000000000000000000000000000000000000..05ac9527f086c8490b0749eb3a123716bc0e95bf --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ms3.c @@ -0,0 +1,953 @@ + +/******************************************************************************* +* +* File ms3.c +* +* Copyright (C) 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Computation of Wilson flow observables. +* +* Syntax: ms3 -i [-noexp] [-a] +* +* For usage instructions see the file README.ms3. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "archive.h" +#include "tcharge.h" +#include "wflow.h" +#include "version.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static struct +{ + int dn,nn,tmax; + double eps; +} file_head; + +static struct +{ + int nc; + double **Wsl,**Ysl,**Qsl; +} data; + +static int my_rank,noexp,append,endian; +static int first,last,step; +static int ipgrd[2],flint; +static double *Wact,*Yact,*Qtop; + +static char line[NAME_SIZE]; +static char log_dir[NAME_SIZE],dat_dir[NAME_SIZE]; +static char loc_dir[NAME_SIZE],cnfg_dir[NAME_SIZE]; +static char log_file[NAME_SIZE],log_save[NAME_SIZE],end_file[NAME_SIZE]; +static char par_file[NAME_SIZE],par_save[NAME_SIZE]; +static char dat_file[NAME_SIZE],dat_save[NAME_SIZE]; +static char cnfg_file[NAME_SIZE],nbase[NAME_SIZE]; +static FILE *fin=NULL,*flog=NULL,*fdat=NULL,*fend=NULL; + +static bc_parms_t bcp; + + +static void alloc_data(void) +{ + int nn,tmax; + int in; + double **pp,*p; + + nn=file_head.nn; + tmax=file_head.tmax; + + pp=amalloc(3*(nn+1)*sizeof(*pp),3); + p=amalloc(3*(nn+1)*(tmax+1)*sizeof(*p),4); + + error((pp==NULL)||(p==NULL),1,"alloc_data [ms3.c]", + "Unable to allocate data arrays"); + + data.Wsl=pp; + data.Ysl=pp+nn+1; + data.Qsl=pp+2*(nn+1); + + for (in=0;in<(3*(nn+1));in++) + { + *pp=p; + pp+=1; + p+=tmax; + } + + Wact=p; + p+=nn+1; + Yact=p; + p+=nn+1; + Qtop=p; +} + + +static void write_file_head(void) +{ + int iw; + stdint_t istd[3]; + double dstd[1]; + + istd[0]=(stdint_t)(file_head.dn); + istd[1]=(stdint_t)(file_head.nn); + istd[2]=(stdint_t)(file_head.tmax); + dstd[0]=file_head.eps; + + if (endian==BIG_ENDIAN) + { + bswap_int(3,istd); + bswap_double(1,dstd); + } + + iw=fwrite(istd,sizeof(stdint_t),3,fdat); + iw+=fwrite(dstd,sizeof(double),1,fdat); + + error_root(iw!=4,1,"write_file_head [ms3.c]", + "Incorrect write count"); +} + + +static void check_file_head(void) +{ + int ir; + stdint_t istd[3]; + double dstd[1]; + + ir=fread(istd,sizeof(stdint_t),3,fdat); + ir+=fread(dstd,sizeof(double),1,fdat); + + error_root(ir!=4,1,"check_file_head [ms3.c]", + "Incorrect read count"); + + if (endian==BIG_ENDIAN) + { + bswap_int(3,istd); + bswap_double(1,dstd); + } + + error_root(((int)(istd[0])!=file_head.dn)|| + ((int)(istd[1])!=file_head.nn)|| + ((int)(istd[2])!=file_head.tmax)|| + (dstd[0]!=file_head.eps),1,"check_file_head [ms3.c]", + "Unexpected value of dn,nn,tmax or eps"); +} + + +static void write_data(void) +{ + int iw,nn,tmax; + int in,t; + stdint_t istd[1]; + double dstd[1]; + + istd[0]=(stdint_t)(data.nc); + + if (endian==BIG_ENDIAN) + bswap_int(1,istd); + + iw=fwrite(istd,sizeof(stdint_t),1,fdat); + + nn=file_head.nn; + tmax=file_head.tmax; + + for (in=0;in<=nn;in++) + { + for (t=0;t=NAME_SIZE, + 1,"setup_files [ms3.c]","loc_dir name is too long"); + else + error_root(name_size("%s/%sn%d",cnfg_dir,nbase,last)>=NAME_SIZE, + 1,"setup_files [ms3.c]","cnfg_dir name is too long"); + + check_dir_root(log_dir); + check_dir_root(dat_dir); + error_root(name_size("%s/%s.ms3.log~",log_dir,nbase)>=NAME_SIZE, + 1,"setup_files [ms3.c]","log_dir name is too long"); + error_root(name_size("%s/%s.ms3.dat~",dat_dir,nbase)>=NAME_SIZE, + 1,"setup_files [ms3.c]","dat_dir name is too long"); + + sprintf(log_file,"%s/%s.ms3.log",log_dir,nbase); + sprintf(par_file,"%s/%s.ms3.par",dat_dir,nbase); + sprintf(dat_file,"%s/%s.ms3.dat",dat_dir,nbase); + sprintf(end_file,"%s/%s.ms3.end",log_dir,nbase); + sprintf(log_save,"%s~",log_file); + sprintf(par_save,"%s~",par_file); + sprintf(dat_save,"%s~",dat_file); +} + + +static void read_bc_parms(void) +{ + int bc; + double phi[2],phi_prime[2]; + + if (my_rank==0) + { + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + + if (bc==1) + read_dprms("phi",2,phi); + + if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + bcp=set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + + if (append) + check_bc_parms(fdat); + else + write_bc_parms(fdat); +} + + +static void read_wflow_parms(void) +{ + int nstep,dnms,ie,ir,iw; + stdint_t istd[3]; + double eps,dstd[1]; + + if (my_rank==0) + { + find_section("Wilson flow"); + read_line("integrator","%s",line); + read_line("eps","%lf",&eps); + read_line("nstep","%d",&nstep); + read_line("dnms","%d",&dnms); + + if (strcmp(line,"EULER")==0) + flint=0; + else if (strcmp(line,"RK2")==0) + flint=1; + else if (strcmp(line,"RK3")==0) + flint=2; + else + error_root(1,1,"read_wflow_parms [ms3.c]","Unknown integrator"); + + error_root((dnms<1)||(nstep [-noexp] [-a]"); + + error_root(endian==UNKNOWN_ENDIAN,1,"read_infile [ms3.c]", + "Machine has unknown endianness"); + + noexp=find_opt(argc,argv,"-noexp"); + append=find_opt(argc,argv,"-a"); + + fin=freopen(argv[ifile+1],"r",stdin); + error_root(fin==NULL,1,"read_infile [ms3.c]", + "Unable to open input file"); + } + + MPI_Bcast(&endian,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&noexp,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&append,1,MPI_INT,0,MPI_COMM_WORLD); + + read_dirs(); + setup_files(); + + if (my_rank==0) + { + if (append) + fdat=fopen(par_file,"rb"); + else + fdat=fopen(par_file,"wb"); + + error_root(fdat==NULL,1,"read_infile [ms3.c]", + "Unable to open parameter file"); + } + + read_bc_parms(); + read_wflow_parms(); + + if (my_rank==0) + { + fclose(fin); + fclose(fdat); + + if (append==0) + copy_file(par_file,par_save); + } +} + + +static void check_old_log(int *fst,int *lst,int *stp) +{ + int ie,ic,isv; + int fc,lc,dc,pc; + int np[4],bp[4]; + + fend=fopen(log_file,"r"); + error_root(fend==NULL,1,"check_old_log [ms3.c]", + "Unable to open log file"); + + fc=0; + lc=0; + dc=0; + pc=0; + + ie=0x0; + ic=0; + isv=0; + + while (fgets(line,NAME_SIZE,fend)!=NULL) + { + if (strstr(line,"process grid")!=NULL) + { + if (sscanf(line,"%dx%dx%dx%d process grid, %dx%dx%dx%d", + np,np+1,np+2,np+3,bp,bp+1,bp+2,bp+3)==8) + { + ipgrd[0]=((np[0]!=NPROC0)||(np[1]!=NPROC1)|| + (np[2]!=NPROC2)||(np[3]!=NPROC3)); + ipgrd[1]=((bp[0]!=NPROC0_BLK)||(bp[1]!=NPROC1_BLK)|| + (bp[2]!=NPROC2_BLK)||(bp[3]!=NPROC3_BLK)); + } + else + ie|=0x1; + } + else if (strstr(line,"fully processed")!=NULL) + { + pc=lc; + + if (sscanf(line,"Configuration no %d",&lc)==1) + { + ic+=1; + isv=1; + } + else + ie|=0x1; + + if (ic==1) + fc=lc; + else if (ic==2) + dc=lc-fc; + else if ((ic>2)&&(lc!=(pc+dc))) + ie|=0x2; + } + else if (strstr(line,"Configuration no")!=NULL) + isv=0; + } + + fclose(fend); + + error_root((ie&0x1)!=0x0,1,"check_old_log [ms3.c]", + "Incorrect read count"); + error_root((ie&0x2)!=0x0,1,"check_old_log [ms3.c]", + "Configuration numbers are not equally spaced"); + error_root(isv==0,1,"check_old_log [ms3.c]", + "Log file extends beyond the last configuration save"); + + (*fst)=fc; + (*lst)=lc; + (*stp)=dc; +} + + +static void check_old_dat(int fst,int lst,int stp) +{ + int ie,ic; + int fc,lc,dc,pc; + + fdat=fopen(dat_file,"rb"); + error_root(fdat==NULL,1,"check_old_dat [ms3.c]", + "Unable to open data file"); + + check_file_head(); + + fc=0; + lc=0; + dc=0; + pc=0; + + ie=0x0; + ic=0; + + while (read_data()==1) + { + pc=lc; + lc=data.nc; + ic+=1; + + if (ic==1) + fc=lc; + else if (ic==2) + dc=lc-fc; + else if ((ic>2)&&(lc!=(pc+dc))) + ie|=0x1; + } + + fclose(fdat); + + error_root(ic==0,1,"check_old_dat [ms3.c]", + "No data records found"); + error_root((ie&0x1)!=0x0,1,"check_old_dat [ms3.c]", + "Configuration numbers are not equally spaced"); + error_root((fst!=fc)||(lst!=lc)||(stp!=dc),1,"check_old_dat [ms3.c]", + "Configuration range is not as reported in the log file"); +} + + +static void check_files(void) +{ + int fst,lst,stp; + + ipgrd[0]=0; + ipgrd[1]=0; + + if (my_rank==0) + { + if (append) + { + check_old_log(&fst,&lst,&stp); + check_old_dat(fst,lst,stp); + + error_root((fst!=lst)&&(stp!=step),1,"check_files [ms3.c]", + "Continuation run:\n" + "Previous run had a different configuration separation"); + error_root(first!=lst+step,1,"check_files [ms3.c]", + "Continuation run:\n" + "Configuration range does not continue the previous one"); + } + else + { + fin=fopen(log_file,"r"); + fdat=fopen(dat_file,"rb"); + + error_root((fin!=NULL)||(fdat!=NULL),1,"check_files [ms3.c]", + "Attempt to overwrite old *.log or *.dat file"); + + fdat=fopen(dat_file,"wb"); + error_root(fdat==NULL,1,"check_files [ms3.c]", + "Unable to open data file"); + write_file_head(); + fclose(fdat); + } + } +} + + +static void print_info(void) +{ + int n[3]; + long ip; + + if (my_rank==0) + { + ip=ftell(flog); + fclose(flog); + + if (ip==0L) + remove("STARTUP_ERROR"); + + if (append) + flog=freopen(log_file,"a",stdout); + else + flog=freopen(log_file,"w",stdout); + + error_root(flog==NULL,1,"print_info [ms3.c]","Unable to open log file"); + printf("\n"); + + if (append) + printf("Continuation run\n\n"); + else + { + printf("Computation of Wilson flow observables\n"); + printf("--------------------------------------\n\n"); + } + + printf("Program version %s\n",openQCD_RELEASE); + + if (endian==LITTLE_ENDIAN) + printf("The machine is little endian\n"); + else + printf("The machine is big endian\n"); + if (noexp) + printf("Configurations are read in imported file format\n\n"); + else + printf("Configurations are read in exported file format\n\n"); + + if ((ipgrd[0]!=0)&&(ipgrd[1]!=0)) + printf("Process grid and process block size changed:\n"); + else if (ipgrd[0]!=0) + printf("Process grid changed:\n"); + else if (ipgrd[1]!=0) + printf("Process block size changed:\n"); + + if ((append==0)||(ipgrd[0]!=0)||(ipgrd[1]!=0)) + { + printf("%dx%dx%dx%d lattice, ",N0,N1,N2,N3); + printf("%dx%dx%dx%d local lattice\n",L0,L1,L2,L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d process block size\n\n", + NPROC0_BLK,NPROC1_BLK,NPROC2_BLK,NPROC3_BLK); + } + + if (append==0) + { + if (bcp.type==0) + printf("Open boundary conditions\n\n"); + else if (bcp.type==1) + { + printf("SF boundary conditions\n"); + + n[0]=fdigits(bcp.phi[0][0]); + n[1]=fdigits(bcp.phi[0][1]); + n[2]=fdigits(bcp.phi[0][2]); + printf("phi = %.*f,%.*f,%.*f\n",IMAX(n[0],1),bcp.phi[0][0], + IMAX(n[1],1),bcp.phi[0][1],IMAX(n[2],1),bcp.phi[0][2]); + + n[0]=fdigits(bcp.phi[1][0]); + n[1]=fdigits(bcp.phi[1][1]); + n[2]=fdigits(bcp.phi[1][2]); + printf("phi' = %.*f,%.*f,%.*f\n\n",IMAX(n[0],1),bcp.phi[1][0], + IMAX(n[1],1),bcp.phi[1][1],IMAX(n[2],1),bcp.phi[1][2]); + } + else if (bcp.type==2) + { + printf("Open-SF boundary conditions\n"); + + n[0]=fdigits(bcp.phi[1][0]); + n[1]=fdigits(bcp.phi[1][1]); + n[2]=fdigits(bcp.phi[1][2]); + printf("phi' = %.*f,%.*f,%.*f\n\n",IMAX(n[0],1),bcp.phi[1][0], + IMAX(n[1],1),bcp.phi[1][1],IMAX(n[2],1),bcp.phi[1][2]); + } + else + printf("Periodic boundary conditions\n\n"); + + printf("Wilson flow:\n"); + if (flint==0) + printf("Euler integrator\n"); + else if (flint==1) + printf("2nd order RK integrator\n"); + else + printf("3rd order RK integrator\n"); + n[0]=fdigits(file_head.eps); + printf("eps = %.*f\n",IMAX(n[0],1),file_head.eps); + printf("nstep = %d\n",file_head.dn*file_head.nn); + printf("dnms = %d\n\n",file_head.dn); + } + + printf("Configurations no %d -> %d in steps of %d\n\n", + first,last,step); + fflush(flog); + } +} + + +static void set_data(int nc) +{ + int in,dn,nn; + double eps; + + data.nc=nc; + dn=file_head.dn; + nn=file_head.nn; + eps=file_head.eps; + + for (in=0;in [-noexp] +* +* For usage instructions see the file README.ms4. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "archive.h" +#include "sflds.h" +#include "linalg.h" +#include "dirac.h" +#include "sap.h" +#include "dfl.h" +#include "forces.h" +#include "version.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +#define MAX(n,m) \ + if ((n)<(m)) \ + (n)=(m) + +static int my_rank,noexp,endian; +static int first,last,step; +static int level,seed,x0,nsrc; +static int *rlxs_state=NULL,*rlxd_state=NULL; +static double mus; + +static char log_dir[NAME_SIZE],loc_dir[NAME_SIZE]; +static char cnfg_dir[NAME_SIZE],sfld_dir[NAME_SIZE]; +static char log_file[NAME_SIZE],log_save[NAME_SIZE],end_file[NAME_SIZE]; +static char cnfg_file[NAME_SIZE],sfld_file[NAME_SIZE],nbase[NAME_SIZE]; +static FILE *fin=NULL,*flog=NULL,*fend=NULL; + +static lat_parms_t lat; +static bc_parms_t bcp; + + +static void read_dirs(void) +{ + if (my_rank==0) + { + find_section("Run name"); + read_line("name","%s",nbase); + + find_section("Directories"); + read_line("log_dir","%s",log_dir); + + if (noexp) + { + read_line("loc_dir","%s",loc_dir); + cnfg_dir[0]='\0'; + } + else + { + read_line("cnfg_dir","%s",cnfg_dir); + loc_dir[0]='\0'; + } + + read_line("sfld_dir","%s",sfld_dir); + + find_section("Configurations"); + read_line("first","%d",&first); + read_line("last","%d",&last); + read_line("step","%d",&step); + + find_section("Random number generator"); + read_line("level","%d",&level); + read_line("seed","%d",&seed); + + error_root((last=NAME_SIZE, + 1,"setup_files [ms4.c]","loc_dir name is too long"); + else + error_root(name_size("%s/%sn%d",cnfg_dir,nbase,last)>=NAME_SIZE, + 1,"setup_files [ms4.c]","cnfg_dir name is too long"); + + check_dir_root(sfld_dir); + error_root(name_size("%s/%sn%d.s%d",sfld_dir,nbase,last,nsrc-1)>=NAME_SIZE, + 1,"setup_files [ms4.c]","sfld_dir name is too long"); + + check_dir_root(log_dir); + error_root(name_size("%s/%s.ms4.log~",log_dir,nbase)>=NAME_SIZE, + 1,"setup_files [ms4.c]","log_dir name is too long"); + + sprintf(log_file,"%s/%s.ms4.log",log_dir,nbase); + sprintf(end_file,"%s/%s.ms4.end",log_dir,nbase); + sprintf(log_save,"%s~",log_file); +} + + +static void read_lat_parms(void) +{ + double kappa,csw; + + if (my_rank==0) + { + find_section("Dirac operator"); + read_line("kappa","%lf",&kappa); + read_line("mu","%lf",&mus); + read_line("csw","%lf",&csw); + + find_section("Source fields"); + read_line("x0","%d",&x0); + read_line("nsrc","%d",&nsrc); + + error_root((x0<0)||(x0>=N0),1,"read_lat_parms [ms4.c]", + "Specified time x0 is out of range"); + error_root(nsrc<1,1,"read_lat_parms [ms4.c]", + "The number of source fields must be at least 1"); + } + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mus,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&csw,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + MPI_Bcast(&x0,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nsrc,1,MPI_INT,0,MPI_COMM_WORLD); + + lat=set_lat_parms(0.0,1.0,1,&kappa,csw); + set_sw_parms(sea_quark_mass(0)); +} + + +static void read_bc_parms(void) +{ + int bc; + double cF,cF_prime; + double phi[2],phi_prime[2]; + + if (my_rank==0) + { + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + error_root(((x0==0)&&(bc!=3))||((x0=(N0-1))&&(bc==0)),1, + "read_bc_parms [ms4.c]","Incompatible choice of boundary " + "conditions and source time"); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + cF=1.0; + cF_prime=1.0; + + if (bc==1) + read_dprms("phi",2,phi); + + if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + + if (bc!=3) + read_line("cF","%lf",&cF); + + if (bc==2) + read_line("cF'","%lf",&cF_prime); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + bcp=set_bc_parms(bc,1.0,1.0,cF,cF_prime,phi,phi_prime); +} + + +static void read_sap_parms(void) +{ + int bs[4]; + + if (my_rank==0) + { + find_section("SAP"); + read_line("bs","%d %d %d %d",bs,bs+1,bs+2,bs+3); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + set_sap_parms(bs,1,4,5); +} + + +static void read_dfl_parms(void) +{ + int bs[4],Ns; + int ninv,nmr,ncy,nkv,nmx; + double kappa,mu,res; + + if (my_rank==0) + { + find_section("Deflation subspace"); + read_line("bs","%d %d %d %d",bs,bs+1,bs+2,bs+3); + read_line("Ns","%d",&Ns); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_parms(bs,Ns); + + if (my_rank==0) + { + find_section("Deflation subspace generation"); + read_line("kappa","%lf",&kappa); + read_line("mu","%lf",&mu); + read_line("ninv","%d",&ninv); + read_line("nmr","%d",&nmr); + read_line("ncy","%d",&ncy); + } + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&ninv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_gen_parms(kappa,mu,ninv,nmr,ncy); + + if (my_rank==0) + { + find_section("Deflation projection"); + read_line("nkv","%d",&nkv); + read_line("nmx","%d",&nmx); + read_line("res","%lf",&res); + } + + MPI_Bcast(&nkv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + set_dfl_pro_parms(nkv,nmx,res); +} + + +static void read_solver(void) +{ + solver_parms_t sp; + + read_solver_parms(0); + sp=solver_parms(0); + + if ((sp.solver==SAP_GCR)||(sp.solver==DFL_SAP_GCR)) + read_sap_parms(); + + if (sp.solver==DFL_SAP_GCR) + read_dfl_parms(); +} + + +static void read_infile(int argc,char *argv[]) +{ + int ifile; + + if (my_rank==0) + { + flog=freopen("STARTUP_ERROR","w",stdout); + + ifile=find_opt(argc,argv,"-i"); + endian=endianness(); + + error_root((ifile==0)||(ifile==(argc-1)),1,"read_infile [ms4.c]", + "Syntax: ms4 -i [-noexp]"); + + error_root(endian==UNKNOWN_ENDIAN,1,"read_infile [ms4.c]", + "Machine has unknown endianness"); + + noexp=find_opt(argc,argv,"-noexp"); + + fin=freopen(argv[ifile+1],"r",stdin); + error_root(fin==NULL,1,"read_infile [ms4.c]", + "Unable to open input file"); + } + + MPI_Bcast(&endian,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&noexp,1,MPI_INT,0,MPI_COMM_WORLD); + + read_dirs(); + setup_files(); + read_lat_parms(); + read_bc_parms(); + read_solver(); + + if (my_rank==0) + fclose(fin); +} + + +static void check_files(void) +{ + if (my_rank==0) + { + fin=fopen(log_file,"r"); + error_root(fin!=NULL,1,"check_files [ms4.c]", + "Attempt to overwrite old *.log file"); + } +} + + +static void print_info(void) +{ + int isap,idfl,n[3]; + long ip; + + if (my_rank==0) + { + ip=ftell(flog); + fclose(flog); + + if (ip==0L) + remove("STARTUP_ERROR"); + + flog=freopen(log_file,"w",stdout); + error_root(flog==NULL,1,"print_info [ms4.c]","Unable to open log file"); + printf("\n"); + + printf("Computation of quark propagators\n"); + printf("--------------------------------\n\n"); + + printf("Program version %s\n",openQCD_RELEASE); + + if (endian==LITTLE_ENDIAN) + printf("The machine is little endian\n"); + else + printf("The machine is big endian\n"); + if (noexp) + printf("Configurations are read in imported file format\n\n"); + else + printf("Configurations are read in exported file format\n\n"); + + printf("%dx%dx%dx%d lattice, ",N0,N1,N2,N3); + printf("%dx%dx%dx%d local lattice\n",L0,L1,L2,L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d process block size\n", + NPROC0_BLK,NPROC1_BLK,NPROC2_BLK,NPROC3_BLK); + printf("SF boundary conditions on the quark fields\n\n"); + + printf("Random number generator:\n"); + printf("level = %d, seed = %d\n\n",level,seed); + + printf("Dirac operator:\n"); + n[0]=fdigits(lat.kappa[0]); + printf("kappa = %.*f\n",IMAX(n[0],6),lat.kappa[0]); + n[0]=fdigits(mus); + printf("mu = %.*f\n",IMAX(n[0],1),mus); + n[0]=fdigits(lat.csw); + printf("csw = %.*f\n\n",IMAX(n[0],1),lat.csw); + + if (bcp.type==0) + { + printf("Open boundary conditions\n"); + + n[0]=fdigits(bcp.cF[0]); + printf("cF = %.*f\n\n",IMAX(n[0],1),bcp.cF[0]); + } + else if (bcp.type==1) + { + printf("SF boundary conditions\n"); + + n[0]=fdigits(bcp.cF[0]); + printf("cF = %.*f\n",IMAX(n[0],1),bcp.cF[0]); + + n[0]=fdigits(bcp.phi[0][0]); + n[1]=fdigits(bcp.phi[0][1]); + n[2]=fdigits(bcp.phi[0][2]); + printf("phi = %.*f,%.*f,%.*f\n",IMAX(n[0],1),bcp.phi[0][0], + IMAX(n[1],1),bcp.phi[0][1],IMAX(n[2],1),bcp.phi[0][2]); + + n[0]=fdigits(bcp.phi[1][0]); + n[1]=fdigits(bcp.phi[1][1]); + n[2]=fdigits(bcp.phi[1][2]); + printf("phi' = %.*f,%.*f,%.*f\n\n",IMAX(n[0],1),bcp.phi[1][0], + IMAX(n[1],1),bcp.phi[1][1],IMAX(n[2],1),bcp.phi[1][2]); + } + else if (bcp.type==2) + { + printf("Open-SF boundary conditions\n"); + + n[0]=fdigits(bcp.cF[0]); + printf("cF = %.*f\n",IMAX(n[0],1),bcp.cF[0]); + n[1]=fdigits(bcp.cF[1]); + printf("cF' = %.*f\n",IMAX(n[1],1),bcp.cF[1]); + + n[0]=fdigits(bcp.phi[1][0]); + n[1]=fdigits(bcp.phi[1][1]); + n[2]=fdigits(bcp.phi[1][2]); + printf("phi' = %.*f,%.*f,%.*f\n\n",IMAX(n[0],1),bcp.phi[1][0], + IMAX(n[1],1),bcp.phi[1][1],IMAX(n[2],1),bcp.phi[1][2]); + } + else + printf("Periodic boundary conditions\n\n"); + + printf("Source fields:\n"); + printf("x0 = %d\n",x0); + printf("nsrc = %d\n\n",nsrc); + + print_solver_parms(&isap,&idfl); + + if (isap) + print_sap_parms(0); + + if (idfl) + print_dfl_parms(0); + + printf("Configurations no %d -> %d in steps of %d\n\n", + first,last,step); + fflush(flog); + } +} + + +static void dfl_wsize(int *nws,int *nwv,int *nwvd) +{ + dfl_parms_t dp; + dfl_pro_parms_t dpp; + + dp=dfl_parms(); + dpp=dfl_pro_parms(); + + MAX(*nws,dp.Ns+2); + MAX(*nwv,2*dpp.nkv+2); + MAX(*nwvd,4); +} + + +static void wsize(int *nws,int *nwsd,int *nwv,int *nwvd) +{ + int nsd; + solver_parms_t sp; + + (*nws)=0; + (*nwsd)=0; + (*nwv)=0; + (*nwvd)=0; + + sp=solver_parms(0); + nsd=2; + + if (sp.solver==CGNE) + { + MAX(*nws,5); + MAX(*nwsd,nsd+3); + } + if (sp.solver==SAP_GCR) + { + MAX(*nws,2*sp.nkv+1); + MAX(*nwsd,nsd+2); + } + else if (sp.solver==DFL_SAP_GCR) + { + MAX(*nws,2*sp.nkv+2); + MAX(*nwsd,nsd+4); + dfl_wsize(nws,nwv,nwvd); + } + else + error_root(1,1,"wsize [ms4.c]", + "Unknown or unsupported solver"); +} + + +static void random_source(spinor_dble *eta) +{ + int y0,iy,ix; + + set_sd2zero(VOLUME,eta); + y0=x0-cpr[0]*L0; + + if ((y0>=0)&&(y0 [-noloc] [-noexp] [-rmold] [-noms] +* [-c [-a [-norng]]] +* +* For usage instructions see the file README.qcd1. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "archive.h" +#include "forces.h" +#include "update.h" +#include "wflow.h" +#include "tcharge.h" +#include "version.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +typedef struct +{ + int nt,iac; + double dH,avpl; +} dat_t; + +static struct +{ + int dn,nn,tmax; + double eps; +} file_head; + +static struct +{ + int nt; + double **Wsl,**Ysl,**Qsl; +} data; + +static int my_rank,noloc,noexp,rmold,noms,norng; +static int scnfg,append,endian; +static int level,seed; +static int nth,ntr,dtr_log,dtr_ms,dtr_cnfg; +static int ipgrd[2],flint; +static double *Wact,*Yact,*Qtop; + +static char line[NAME_SIZE]; +static char log_dir[NAME_SIZE],dat_dir[NAME_SIZE]; +static char loc_dir[NAME_SIZE],cnfg_dir[NAME_SIZE]; +static char log_file[NAME_SIZE],log_save[NAME_SIZE]; +static char par_file[NAME_SIZE],par_save[NAME_SIZE]; +static char dat_file[NAME_SIZE],dat_save[NAME_SIZE]; +static char msdat_file[NAME_SIZE],msdat_save[NAME_SIZE]; +static char rng_file[NAME_SIZE],rng_save[NAME_SIZE]; +static char cnfg_file[NAME_SIZE],end_file[NAME_SIZE]; +static char nbase[NAME_SIZE],cnfg[NAME_SIZE]; +static FILE *fin=NULL,*flog=NULL,*fdat=NULL,*fend=NULL; + +static hmc_parms_t hmc; + + +static int write_dat(int n,dat_t *ndat) +{ + int i,iw,ic; + stdint_t istd[2]; + double dstd[2]; + + ic=0; + + for (i=0;i=NAME_SIZE,1, + "setup_files [qcd1.c]","log_dir name is too long"); + error_root(name_size("%s/%s.ms.dat~",dat_dir,nbase)>=NAME_SIZE,1, + "setup_files [qcd1.c]","dat_dir name is too long"); + + sprintf(log_file,"%s/%s.log",log_dir,nbase); + sprintf(par_file,"%s/%s.par",dat_dir,nbase); + sprintf(dat_file,"%s/%s.dat",dat_dir,nbase); + sprintf(msdat_file,"%s/%s.ms.dat",dat_dir,nbase); + sprintf(rng_file,"%s/%s.rng",dat_dir,nbase); + sprintf(end_file,"%s/%s.end",log_dir,nbase); + sprintf(log_save,"%s~",log_file); + sprintf(par_save,"%s~",par_file); + sprintf(dat_save,"%s~",dat_file); + sprintf(msdat_save,"%s~",msdat_file); + sprintf(rng_save,"%s~",rng_file); +} + + +static void read_lat_parms(void) +{ + int nk; + double beta,c0,csw,*kappa; + + if (my_rank==0) + { + find_section("Lattice parameters"); + read_line("beta","%lf",&beta); + read_line("c0","%lf",&c0); + nk=count_tokens("kappa"); + read_line("csw","%lf",&csw); + } + + MPI_Bcast(&beta,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&c0,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&nk,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&csw,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + if (nk>0) + { + kappa=malloc(nk*sizeof(*kappa)); + error(kappa==NULL,1,"read_lat_parms [qcd1.c]", + "Unable to allocate parameter array"); + if (my_rank==0) + read_dprms("kappa",nk,kappa); + MPI_Bcast(kappa,nk,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + kappa=NULL; + + set_lat_parms(beta,c0,nk,kappa,csw); + + if (nk>0) + free(kappa); + + if (append) + check_lat_parms(fdat); + else + write_lat_parms(fdat); +} + + +static void read_bc_parms(void) +{ + int bc; + double cG,cG_prime,cF,cF_prime; + double phi[2],phi_prime[2]; + + if (my_rank==0) + { + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + cG=1.0; + cG_prime=1.0; + cF=1.0; + cF_prime=1.0; + + if (bc==1) + read_dprms("phi",2,phi); + + if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + + if (bc!=3) + { + read_line("cG","%lf",&cG); + read_line("cF","%lf",&cF); + } + + if (bc==2) + { + read_line("cG'","%lf",&cG_prime); + read_line("cF'","%lf",&cF_prime); + } + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cG,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cG_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + set_bc_parms(bc,cG,cG_prime,cF,cF_prime,phi,phi_prime); + + if (append) + check_bc_parms(fdat); + else + write_bc_parms(fdat); +} + + +static void read_schedule(void) +{ + int ie,ir,iw; + stdint_t istd[3]; + + if (my_rank==0) + { + find_section("MD trajectories"); + read_line("nth","%d",&nth); + read_line("ntr","%d",&ntr); + read_line("dtr_log","%d",&dtr_log); + if (noms==0) + read_line("dtr_ms","%d",&dtr_ms); + else + dtr_ms=0; + read_line("dtr_cnfg","%d",&dtr_cnfg); + + error_root((append!=0)&&(nth!=0),1,"read_schedule [qcd1.c]", + "Continuation run: nth must be equal to zero"); + + ie=0; + ie|=(nth<0); + ie|=(ntr<1); + ie|=(dtr_log<1); + ie|=(dtr_log>dtr_cnfg); + ie|=((dtr_cnfg%dtr_log)!=0); + ie|=((nth%dtr_cnfg)!=0); + ie|=((ntr%dtr_cnfg)!=0); + + if (noms==0) + { + ie|=(dtr_msdtr_cnfg); + ie|=((dtr_ms%dtr_log)!=0); + ie|=((dtr_cnfg%dtr_ms)!=0); + } + + error_root(ie!=0,1,"read_schedule [qcd1.c]", + "Improper value of nth,ntr,dtr_log,dtr_ms or dtr_cnfg"); + } + + MPI_Bcast(&nth,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ntr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&dtr_log,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&dtr_ms,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&dtr_cnfg,1,MPI_INT,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + if (append) + { + ir=fread(istd,sizeof(stdint_t),3,fdat); + error_root(ir!=3,1,"read_schedule [qcd1.c]", + "Incorrect read count"); + + if (endian==BIG_ENDIAN) + bswap_int(3,istd); + + ie=0; + ie|=(istd[0]!=(stdint_t)(dtr_log)); + ie|=(istd[1]!=(stdint_t)(dtr_ms)); + ie|=(istd[2]!=(stdint_t)(dtr_cnfg)); + + error_root(ie!=0,1,"read_schedule [qcd1.c]", + "Parameters do not match previous run"); + } + else + { + istd[0]=(stdint_t)(dtr_log); + istd[1]=(stdint_t)(dtr_ms); + istd[2]=(stdint_t)(dtr_cnfg); + + if (endian==BIG_ENDIAN) + bswap_int(3,istd); + + iw=fwrite(istd,sizeof(stdint_t),3,fdat); + error_root(iw!=3,1,"read_schedule [qcd1.c]", + "Incorrect write count"); + } + } +} + + +static void read_actions(void) +{ + int i,k,l,nact,*iact; + int npf,nlv,nmu; + double tau,*mu; + action_parms_t ap; + rat_parms_t rp; + + if (my_rank==0) + { + find_section("HMC parameters"); + nact=count_tokens("actions"); + read_line("npf","%d",&npf); + read_line("nlv","%d",&nlv); + read_line("tau","%lf",&tau); + } + + MPI_Bcast(&nact,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&npf,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nlv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&tau,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + if (nact>0) + { + iact=malloc(nact*sizeof(*iact)); + error(iact==NULL,1,"read_actions [qcd1.c]", + "Unable to allocate temporary array"); + if (my_rank==0) + read_iprms("actions",nact,iact); + MPI_Bcast(iact,nact,MPI_INT,0,MPI_COMM_WORLD); + } + else + iact=NULL; + + nmu=0; + + for (i=0;i0) + { + mu=malloc(nmu*sizeof(*mu)); + error(mu==NULL,1,"read_actions [qcd1.c]", + "Unable to allocate temporary array"); + + if (my_rank==0) + { + find_section("HMC parameters"); + read_dprms("mu",nmu,mu); + } + + MPI_Bcast(mu,nmu,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + mu=NULL; + + hmc=set_hmc_parms(nact,iact,npf,nmu,mu,nlv,tau); + + if (nact>0) + free(iact); + if (nmu>0) + free(mu); + + if (append) + { + check_hmc_parms(fdat); + check_action_parms(fdat); + } + else + { + write_hmc_parms(fdat); + write_action_parms(fdat); + } +} + + +static void read_integrator(void) +{ + int nlv,i,j,k,l; + mdint_parms_t mdp; + force_parms_t fp; + rat_parms_t rp; + + nlv=hmc.nlv; + + for (i=0;i [-noloc] [-noexp] " + "[-rmold] [-noms] [-c [-a [-norng]]]"); + + error_root(endian==UNKNOWN_ENDIAN,1,"read_infile [qcd1.c]", + "Machine has unknown endianness"); + + error_root((noexp)&&(noloc),1,"read_infile [qcd1.c]", + "The concurrent use of -noloc and -noexp is not permitted"); + + if (scnfg) + { + strncpy(cnfg,argv[scnfg+1],NAME_SIZE-1); + cnfg[NAME_SIZE-1]='\0'; + } + else + cnfg[0]='\0'; + + fin=freopen(argv[ifile+1],"r",stdin); + error_root(fin==NULL,1,"read_infile [qcd1.c]", + "Unable to open input file"); + } + + MPI_Bcast(&noloc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&noexp,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&rmold,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&noms,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&scnfg,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&append,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&norng,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&endian,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(cnfg,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + find_section("Random number generator"); + read_line("level","%d",&level); + read_line("seed","%d",&seed); + } + + MPI_Bcast(&level,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&seed,1,MPI_INT,0,MPI_COMM_WORLD); + + read_dirs(); + setup_files(); + + if (my_rank==0) + { + if (append) + fdat=fopen(par_file,"rb"); + else + fdat=fopen(par_file,"wb"); + + error_root(fdat==NULL,1,"read_infile [qcd1.c]", + "Unable to open parameter file"); + } + + read_lat_parms(); + read_bc_parms(); + read_schedule(); + read_actions(); + read_integrator(); + read_solvers(); + read_wflow_parms(); + + if (my_rank==0) + { + fclose(fin); + fclose(fdat); + + if (append==0) + copy_file(par_file,par_save); + } +} + + +static void check_old_log(int ic,int *nl,int *icnfg) +{ + int ir,isv; + int np[4],bp[4]; + + fend=fopen(log_file,"r"); + error_root(fend==NULL,1,"check_old_log [qcd1.c]", + "Unable to open log file"); + (*nl)=0; + (*icnfg)=0; + ir=1; + isv=0; + + while (fgets(line,NAME_SIZE,fend)!=NULL) + { + if (strstr(line,"process grid")!=NULL) + { + ir&=(sscanf(line,"%dx%dx%dx%d process grid, %dx%dx%dx%d", + np,np+1,np+2,np+3,bp,bp+1,bp+2,bp+3)==8); + + ipgrd[0]=((np[0]!=NPROC0)||(np[1]!=NPROC1)|| + (np[2]!=NPROC2)||(np[3]!=NPROC3)); + ipgrd[1]=((bp[0]!=NPROC0_BLK)||(bp[1]!=NPROC1_BLK)|| + (bp[2]!=NPROC2_BLK)||(bp[3]!=NPROC3_BLK)); + } + else if (strstr(line,"Trajectory no")!=NULL) + { + ir&=(sscanf(line,"Trajectory no %d",nl)==1); + isv=0; + } + else if (strstr(line,"Configuration no")!=NULL) + { + ir&=(sscanf(line,"Configuration no %d",icnfg)==1); + isv=1; + } + } + + fclose(fend); + + error_root(ir!=1,1,"check_old_log [qcd1.c]","Incorrect read count"); + + error_root(ic!=(*icnfg),1,"check_old_log [qcd1.c]", + "Continuation run:\n" + "Initial configuration is not the last one of the previous run"); + + error_root(isv==0,1,"check_old_log [qcd1.c]", + "Continuation run:\n" + "The log file extends beyond the last configuration save"); +} + + +static void check_old_dat(int nl) +{ + int nt; + dat_t ndat; + + fdat=fopen(dat_file,"rb"); + error_root(fdat==NULL,1,"check_old_dat [qcd1.c]", + "Unable to open data file"); + nt=0; + + while (read_dat(1,&ndat)==1) + nt=ndat.nt; + + fclose(fdat); + + error_root(nt!=nl,1,"check_old_dat [qcd1.c]", + "Continuation run: Incomplete or too many data records"); +} + + +static void check_old_msdat(int nl) +{ + int ic,ir,nt,pnt,dnt; + + fdat=fopen(msdat_file,"rb"); + error_root(fdat==NULL,1,"check_old_msdat [qcd1.c]", + "Unable to open data file"); + + check_file_head(); + + nt=0; + dnt=0; + pnt=0; + + for (ic=0;;ic++) + { + ir=read_data(); + + if (ir==0) + { + error_root(ic==0,1,"check_old_msdat [qcd1.c]", + "No data records found"); + break; + } + + nt=data.nt; + + if (ic==1) + { + dnt=nt-pnt; + error_root(dnt<1,1,"check_old_msdat [qcd1.c]", + "Incorrect trajectory separation"); + } + else if (ic>1) + error_root(nt!=(pnt+dnt),1,"check_old_msdat [qcd1.c]", + "Trajectory sequence is not equally spaced"); + + pnt=nt; + } + + fclose(fdat); + + error_root((nt!=nl)||((ic>1)&&(dnt!=dtr_ms)),1, + "check_old_msdat [qcd1.c]","Last trajectory numbers " + "or the trajectory separations do not match"); +} + + +static void check_files(int *nl,int *icnfg) +{ + int icmax,ic; + + ipgrd[0]=0; + ipgrd[1]=0; + + if (my_rank==0) + { + if (noloc) + error_root(cnfg[strlen(cnfg)-1]=='*',1, + "check_files [qcd1.c]","Attempt to read an " + "imported configuration when -noloc is set"); + + if (append) + { + error_root(strstr(cnfg,nbase)!=cnfg,1,"check_files [qcd1.c]", + "Continuation run:\n" + "Run name does not match the previous one"); + error_root(sscanf(cnfg+strlen(nbase),"n%d",&ic)!=1,1, + "check_files [qcd1.c]","Continuation run:\n" + "Unable to read configuration number from file name"); + + check_old_log(ic,nl,icnfg); + check_old_dat(*nl); + if (noms==0) + check_old_msdat(*nl); + + (*icnfg)+=1; + } + else + { + fin=fopen(log_file,"r"); + fdat=fopen(dat_file,"rb"); + + if (noms==0) + fend=fopen(msdat_file,"rb"); + else + fend=NULL; + + error_root((fin!=NULL)||(fdat!=NULL)||(fend!=NULL),1, + "check_files [qcd1.c]", + "Attempt to overwrite old *.log or *.dat file"); + + if (noms==0) + { + fdat=fopen(msdat_file,"wb"); + error_root(fdat==NULL,1,"check_files [qcd1.c]", + "Unable to open measurement data file"); + write_file_head(); + fclose(fdat); + } + + (*nl)=0; + (*icnfg)=1; + } + + icmax=(*icnfg)+(ntr-nth)/dtr_cnfg; + + if (noloc==0) + error_root(name_size("%s/%sn%d_%d",loc_dir,nbase,icmax,NPROC-1)>= + NAME_SIZE,1,"check_files [qcd1.c]", + "loc_dir name is too long"); + + if (noexp==0) + error_root(name_size("%s/%sn%d",cnfg_dir,nbase,icmax)>=NAME_SIZE,1, + "check_files [qcd1.c]","cnfg_dir name is too long"); + + if (scnfg) + { + if (cnfg[strlen(cnfg)-1]=='*') + error_root(name_size("%s/%s%d",loc_dir,cnfg,NPROC-1)>=NAME_SIZE,1, + "check_files [qcd1.c]","loc_dir name is too long"); + else + error_root(name_size("%s/%s",cnfg_dir,cnfg)>=NAME_SIZE,1, + "check_files [qcd1.c]","cnfg_dir name is too long"); + } + } + + MPI_Bcast(nl,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(icnfg,1,MPI_INT,0,MPI_COMM_WORLD); +} + + +static void init_ud(void) +{ + char *p; + + if (scnfg) + { + if (cnfg[strlen(cnfg)-1]!='*') + { + sprintf(cnfg_file,"%s/%s",cnfg_dir,cnfg); + import_cnfg(cnfg_file); + } + else + { + sprintf(line,"%s/%s",loc_dir,cnfg); + p=line+strlen(line)-1; + p[0]='\0'; + sprintf(cnfg_file,"%s_%d",line,my_rank); + read_cnfg(cnfg_file); + } + } + else + random_ud(); +} + + +static void init_rng(int icnfg) +{ + int ic; + + if (append) + { + if (cnfg[strlen(cnfg)-1]!='*') + { + if (norng) + start_ranlux(level,seed^(icnfg-1)); + else + { + ic=import_ranlux(rng_file); + error_root(ic!=(icnfg-1),1,"init_rng [qcd1.c]", + "Configuration number mismatch (*.rng file)"); + } + } + } + else + start_ranlux(level,seed); +} + + +static void store_ud(su3_dble *usv) +{ + su3_dble *udb; + + udb=udfld(); + cm3x3_assign(4*VOLUME,udb,usv); +} + + +static void recall_ud(su3_dble *usv) +{ + su3_dble *udb; + + udb=udfld(); + cm3x3_assign(4*VOLUME,usv,udb); + set_flags(UPDATED_UD); +} + + +static void set_data(int nt) +{ + int in,dn,nn; + double eps; + + data.nt=nt; + dn=file_head.dn; + nn=file_head.nn; + eps=file_head.eps; + + for (in=0;in0); + dn=file_head.dn; + nn=file_head.nn; + eps=file_head.eps; + + din=nn/10; + if (din<1) + din=1; + + printf("Measurement run:\n\n"); + + for (in=0;in<=nn;in+=din) + printf("n = %3d, t = %.2e, Wact = %.6e, Yact = %.6e, Q = % .2e\n", + in*dn,eps*(double)(in*dn),Wact[in],Yact[in],Qtop[in]); + + printf("\n"); + printf("Configuration fully processed in %.2e sec ",wtms); + printf("(average = %.2e sec)\n",wtmsall/(double)(nms)); + printf("Measured data saved\n\n"); + fflush(flog); + } +} + + +static void save_cnfg(int icnfg) +{ + int ie; + + ie=check_bc(0.0)^0x1; + ie|=chs_ubnd(1); + error_root(ie!=0,1,"save_cnfg [qcd1.c]","Unexpected boundary values"); + + if (noloc==0) + { + sprintf(cnfg_file,"%s/%sn%d_%d",loc_dir,nbase,icnfg,my_rank); + write_cnfg(cnfg_file); + } + + if (noexp==0) + { + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + export_cnfg(cnfg_file); + } + + if (my_rank==0) + { + if ((noloc==0)&&(noexp==0)) + printf("Configuration no %d saved on the local disks " + "and exported\n\n",icnfg); + else if (noloc==0) + printf("Configuration no %d saved on the local disks\n\n",icnfg); + else if (noexp==0) + printf("Configuration no %d exported\n\n",icnfg); + } +} + + +static void check_endflag(int *iend) +{ + if (my_rank==0) + { + fend=fopen(end_file,"r"); + + if (fend!=NULL) + { + fclose(fend); + remove(end_file); + (*iend)=1; + printf("End flag set, run stopped\n\n"); + } + else + (*iend)=0; + } + + MPI_Bcast(iend,1,MPI_INT,0,MPI_COMM_WORLD); +} + + +static void remove_cnfg(int icnfg) +{ + if ((rmold)&&(icnfg>=1)) + { + if (noloc==0) + { + sprintf(cnfg_file,"%s/%sn%d_%d",loc_dir,nbase,icnfg,my_rank); + remove(cnfg_file); + } + + if ((noexp==0)&&(my_rank==0)) + { + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + remove(cnfg_file); + } + } +} + + +int main(int argc,char *argv[]) +{ + int nl,icnfg; + int nwud,nws,nwsd,nwv,nwvd; + int n,iend,iac,i; + double *act0,*act1,w0[2],w1[2],npl,siac; + double wt1,wt2,wtcyc,wtall,wtms,wtmsall; + su3_dble **usv; + dat_t ndat; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + read_infile(argc,argv); + if (noms==0) + alloc_data(); + check_files(&nl,&icnfg); + geometry(); + + hmc_wsize(&nwud,&nws,&nwsd,&nwv,&nwvd); + alloc_wud(nwud); + alloc_ws(nws); + alloc_wsd(nwsd); + alloc_wv(nwv); + alloc_wvd(nwvd); + if ((noms==0)&&(flint)) + alloc_wfd(1); + + act0=malloc(2*(hmc.nact+1)*sizeof(*act0)); + act1=act0+hmc.nact+1; + error(act0==NULL,1,"main [qcd1.c]","Unable to allocate action arrays"); + + print_info(icnfg); + hmc_sanity_check(); + set_mdsteps(); + setup_counters(); + setup_chrono(); + init_ud(); + init_rng(icnfg); + + if (bc_type()==0) + npl=(double)(6*(N0-1)*N1)*(double)(N2*N3); + else + npl=(double)(6*N0*N1)*(double)(N2*N3); + + iend=0; + siac=0.0; + wtcyc=0.0; + wtall=0.0; + wtms=0.0; + wtmsall=0.0; + + for (n=0;(iend==0)&&(n=nth)&&(((ntr-n-1)%dtr_ms)==0)) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + usv=reserve_wud(1); + store_ud(usv[0]); + set_data(nl+n+1); + recall_ud(usv[0]); + release_wud(); + + MPI_Barrier(MPI_COMM_WORLD); + wt2=MPI_Wtime(); + + wtms=wt2-wt1; + wtmsall+=wtms; + save_msdat(n,wtms,wtmsall); + } + } + + if (((n+1)>=nth)&&(((ntr-n-1)%dtr_cnfg)==0)) + { + save_cnfg(icnfg); + export_ranlux(icnfg,rng_file); + check_endflag(&iend); + error_chk(); + + if (my_rank==0) + { + fflush(flog); + copy_file(log_file,log_save); + copy_file(dat_file,dat_save); + if (noms==0) + copy_file(msdat_file,msdat_save); + copy_file(rng_file,rng_save); + } + + remove_cnfg(icnfg-1); + icnfg+=1; + } + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/qcd1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/qcd1.in new file mode 100644 index 0000000000000000000000000000000000000000..303c84de3d0ab82aadc3964366c51c21925eb9a9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/qcd1.in @@ -0,0 +1,145 @@ + +[Run name] +name Snoopy137 + +[Directories] +log_dir ../data/qcd1/log +dat_dir ../data/qcd1/dat +loc_dir /ndata/qcd1/cnfg +cnfg_dir /data/qcd1/cnfg + +[Random number generator] +level 0 +seed 73099 + +[Lattice parameters] +beta 6.0 +c0 1.6667 +kappa 0.1300 +csw 1.234 + +[Boundary conditions] +type 2 +phi 0.12 -0.56 +phi' 0.92 0.76 +cG 1.10 +cG' 1.05 +cF 0.95 +cF' 0.90 + +[HMC parameters] +actions 0 1 2 +npf 2 +mu 0.01 1.0 +nlv 3 +tau 0.5 + +[MD trajectories] +nth 320 +ntr 32000 +dtr_log 4 +dtr_ms 8 +dtr_cnfg 32 + +[Level 0] +integrator OMF4 +nstep 1 +forces 0 + +[Level 1] +integrator OMF2 +lambda 0.2 +nstep 2 +forces 1 + +[Level 2] +integrator LPFR +nstep 6 +forces 2 + +[Action 0] +action ACG + +[Action 1] +action ACF_TM1 +ipf 0 +im0 0 +imu 1 +isp 0 + +[Action 2] +action ACF_TM2 +ipf 1 +im0 0 +imu 0 1 +isp 1 0 + +[Force 0] +force FRG + +[Force 1] +force FRF_TM1 +isp 2 +ncr 4 + +[Force 2] +force FRF_TM2 +isp 3 +ncr 0 + +[Solver 0] +solver CGNE +nmx 256 +res 1.0e-10 + +[Solver 1] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 24 +res 1.0e-10 + +[Solver 2] +solver CGNE +nmx 256 +res 1.0e-8 + +[Solver 3] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 24 +res 1.0e-8 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.1350 +mu 0.01 +ninv 5 +nmr 4 +ncy 5 + +[Deflation projection] +nkv 24 +nmx 512 +res 1.0e-2 + +[Deflation update scheme] +dtau 0.3 +nsm 1 + +[Wilson flow] +integrator RK3 +eps 2.0e-2 +nstep 100 +dnms 10 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ym1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ym1.c new file mode 100644 index 0000000000000000000000000000000000000000..676122ea15fea2d4c75922e720e7c9449055e087 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ym1.c @@ -0,0 +1,1639 @@ + +/******************************************************************************* +* +* File ym1.c +* +* Copyright (C) 2010-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* HMC simulation program for the SU(3) gauge theory. +* +* Syntax: ym1 -i [-noloc] [-noexp] [-rmold] [-noms] +* [-c [-a [-norng]]] +* +* For usage instructions see the file README.ym1. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "archive.h" +#include "forces.h" +#include "update.h" +#include "wflow.h" +#include "tcharge.h" +#include "version.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +typedef struct +{ + int nt,iac; + double dH,avpl; +} dat_t; + +static struct +{ + int dn,nn,tmax; + double eps; +} file_head; + +static struct +{ + int nt; + double **Wsl,**Ysl,**Qsl; +} data; + +static int my_rank,noloc,noexp,rmold,noms,norng; +static int scnfg,append,endian; +static int level,seed; +static int nth,ntr,dtr_log,dtr_ms,dtr_cnfg; +static int ipgrd[2],flint; +static double *Wact,*Yact,*Qtop; + +static char line[NAME_SIZE]; +static char log_dir[NAME_SIZE],dat_dir[NAME_SIZE]; +static char loc_dir[NAME_SIZE],cnfg_dir[NAME_SIZE]; +static char log_file[NAME_SIZE],log_save[NAME_SIZE]; +static char par_file[NAME_SIZE],par_save[NAME_SIZE]; +static char dat_file[NAME_SIZE],dat_save[NAME_SIZE]; +static char msdat_file[NAME_SIZE],msdat_save[NAME_SIZE]; +static char rng_file[NAME_SIZE],rng_save[NAME_SIZE]; +static char cnfg_file[NAME_SIZE],end_file[NAME_SIZE]; +static char nbase[NAME_SIZE],cnfg[NAME_SIZE]; +static FILE *fin=NULL,*flog=NULL,*fdat=NULL,*fend=NULL; + +static lat_parms_t lat; +static bc_parms_t bcp; +static hmc_parms_t hmc; + + +static int write_dat(int n,dat_t *ndat) +{ + int i,iw,ic; + stdint_t istd[2]; + double dstd[2]; + + ic=0; + + for (i=0;i=NAME_SIZE,1, + "setup_files [ym1.c]","log_dir name is too long"); + error_root(name_size("%s/%s.ms.dat~",dat_dir,nbase)>=NAME_SIZE,1, + "setup_files [ym1.c]","dat_dir name is too long"); + + sprintf(log_file,"%s/%s.log",log_dir,nbase); + sprintf(par_file,"%s/%s.par",dat_dir,nbase); + sprintf(dat_file,"%s/%s.dat",dat_dir,nbase); + sprintf(msdat_file,"%s/%s.ms.dat",dat_dir,nbase); + sprintf(rng_file,"%s/%s.rng",dat_dir,nbase); + sprintf(end_file,"%s/%s.end",log_dir,nbase); + sprintf(log_save,"%s~",log_file); + sprintf(par_save,"%s~",par_file); + sprintf(dat_save,"%s~",dat_file); + sprintf(msdat_save,"%s~",msdat_file); + sprintf(rng_save,"%s~",rng_file); +} + + +static void read_lat_parms(void) +{ + double beta,c0; + + if (my_rank==0) + { + find_section("Lattice parameters"); + read_line("beta","%lf",&beta); + read_line("c0","%lf",&c0); + } + + MPI_Bcast(&beta,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&c0,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + lat=set_lat_parms(beta,c0,0,NULL,1.0); + + if (append) + check_lat_parms(fdat); + else + write_lat_parms(fdat); +} + + +static void read_bc_parms(void) +{ + int bc; + double cG,cG_prime; + double phi[2],phi_prime[2]; + + if (my_rank==0) + { + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + cG=1.0; + cG_prime=1.0; + + if (bc==1) + read_dprms("phi",2,phi); + + if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + + if (bc!=3) + read_line("cG","%lf",&cG); + + if (bc==2) + read_line("cG'","%lf",&cG_prime); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cG,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cG_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + bcp=set_bc_parms(bc,cG,cG_prime,1.0,1.0,phi,phi_prime); + + if (append) + check_bc_parms(fdat); + else + write_bc_parms(fdat); +} + + +static void read_hmc_parms(void) +{ + int iact[1]; + double tau; + + if (my_rank==0) + { + find_section("Trajectory length"); + read_line("tau","%lf",&tau); + } + + MPI_Bcast(&tau,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + iact[0]=0; + hmc=set_hmc_parms(1,iact,0,0,NULL,1,tau); + + if (append) + check_hmc_parms(fdat); + else + write_hmc_parms(fdat); +} + + +static void read_integrator(void) +{ + int nstep,imd,ifr[1]; + double lambda; + + if (my_rank==0) + { + find_section("MD integrator"); + read_line("integrator","%s",line); + lambda=0.0; + + if (strcmp(line,"LPFR")==0) + imd=(int)(LPFR); + else if (strcmp(line,"OMF2")==0) + { + imd=(int)(OMF2); + read_line("lambda","%lf",&lambda); + } + else if (strcmp(line,"OMF4")==0) + imd=(int)(OMF4); + else + error_root(1,1,"read_integrator [ym1.c]","Unknown integrator"); + + read_line("nstep","%d",&nstep); + } + + MPI_Bcast(&imd,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&lambda,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&nstep,1,MPI_INT,0,MPI_COMM_WORLD); + + ifr[0]=0; + + if (imd==(int)(LPFR)) + set_mdint_parms(0,LPFR,lambda,nstep,1,ifr); + else if (imd==(int)(OMF2)) + set_mdint_parms(0,OMF2,lambda,nstep,1,ifr); + else if (imd==(int)(OMF4)) + set_mdint_parms(0,OMF4,lambda,nstep,1,ifr); + + set_action_parms(0,ACG,0,0,NULL,NULL,NULL); + set_force_parms(0,FRG,0,0,NULL,NULL,NULL,NULL); + + if (append) + { + check_mdint_parms(fdat); + check_action_parms(fdat); + check_force_parms(fdat); + } + else + { + write_mdint_parms(fdat); + write_action_parms(fdat); + write_force_parms(fdat); + } +} + + +static void read_schedule(void) +{ + int ie,ir,iw; + stdint_t istd[3]; + + if (my_rank==0) + { + find_section("MD trajectories"); + read_line("nth","%d",&nth); + read_line("ntr","%d",&ntr); + read_line("dtr_log","%d",&dtr_log); + if (noms==0) + read_line("dtr_ms","%d",&dtr_ms); + else + dtr_ms=0; + read_line("dtr_cnfg","%d",&dtr_cnfg); + + error_root((append!=0)&&(nth!=0),1,"read_schedule [ym1.c]", + "Continuation run: nth must be equal to zero"); + + ie=0; + ie|=(nth<0); + ie|=(ntr<1); + ie|=(dtr_log<1); + ie|=(dtr_log>dtr_cnfg); + ie|=((dtr_cnfg%dtr_log)!=0); + ie|=((nth%dtr_cnfg)!=0); + ie|=((ntr%dtr_cnfg)!=0); + + if (noms==0) + { + ie|=(dtr_msdtr_cnfg); + ie|=((dtr_ms%dtr_log)!=0); + ie|=((dtr_cnfg%dtr_ms)!=0); + } + + error_root(ie!=0,1,"read_schedule [ym1.c]", + "Improper value of nth,ntr,dtr_log,dtr_ms or dtr_cnfg"); + } + + MPI_Bcast(&nth,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ntr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&dtr_log,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&dtr_ms,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&dtr_cnfg,1,MPI_INT,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + if (append) + { + ir=fread(istd,sizeof(stdint_t),3,fdat); + error_root(ir!=3,1,"read_schedule [ym1.c]", + "Incorrect read count"); + + if (endian==BIG_ENDIAN) + bswap_int(3,istd); + + ie=0; + ie|=(istd[0]!=(stdint_t)(dtr_log)); + ie|=(istd[1]!=(stdint_t)(dtr_ms)); + ie|=(istd[2]!=(stdint_t)(dtr_cnfg)); + + error_root(ie!=0,1,"read_schedule [ym1.c]", + "Parameters do not match previous run"); + } + else + { + istd[0]=(stdint_t)(dtr_log); + istd[1]=(stdint_t)(dtr_ms); + istd[2]=(stdint_t)(dtr_cnfg); + + if (endian==BIG_ENDIAN) + bswap_int(3,istd); + + iw=fwrite(istd,sizeof(stdint_t),3,fdat); + error_root(iw!=3,1,"read_schedule [ym1.c]", + "Incorrect write count"); + } + } +} + + +static void read_wflow_parms(void) +{ + int nstep,dnms,ie,ir,iw; + stdint_t istd[3]; + double eps,dstd[1]; + + if (my_rank==0) + { + if (append) + { + ir=fread(istd,sizeof(stdint_t),1,fdat); + error_root(ir!=1,1,"read_wflow_parms [ym1.c]", + "Incorrect read count"); + + if (endian==BIG_ENDIAN) + bswap_int(1,istd); + + error_root(istd[0]!=(stdint_t)(noms==0),1,"read_wflow_parms [ym1.c]", + "Attempt to mix measurement with other runs"); + } + else + { + istd[0]=(stdint_t)(noms==0); + + if (endian==BIG_ENDIAN) + bswap_int(1,istd); + + iw=fwrite(istd,sizeof(stdint_t),1,fdat); + error_root(iw!=1,1,"read_wflow_parms [ym1.c]", + "Incorrect write count"); + } + + if (noms==0) + { + find_section("Wilson flow"); + read_line("integrator","%s",line); + read_line("eps","%lf",&eps); + read_line("nstep","%d",&nstep); + read_line("dnms","%d",&dnms); + + if (strcmp(line,"EULER")==0) + flint=0; + else if (strcmp(line,"RK2")==0) + flint=1; + else if (strcmp(line,"RK3")==0) + flint=2; + else + error_root(1,1,"read_wflow_parms [ym1.c]","Unkown integrator"); + + error_root((dnms<1)||(nstep [-noloc] [-noexp] " + "[-rmold] [-noms] [-c [-a [-norng]]]"); + + error_root(endian==UNKNOWN_ENDIAN,1,"read_infile [ym1.c]", + "Machine has unknown endianness"); + + error_root((noexp)&&(noloc),1,"read_infile [ym1.c]", + "The concurrent use of -noloc and -noexp is not permitted"); + + if (scnfg) + { + strncpy(cnfg,argv[scnfg+1],NAME_SIZE-1); + cnfg[NAME_SIZE-1]='\0'; + } + else + cnfg[0]='\0'; + + fin=freopen(argv[ifile+1],"r",stdin); + error_root(fin==NULL,1,"read_infile [ym1.c]", + "Unable to open input file"); + } + + MPI_Bcast(&noloc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&noexp,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&rmold,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&noms,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&scnfg,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&append,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&norng,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&endian,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(cnfg,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + find_section("Random number generator"); + read_line("level","%d",&level); + read_line("seed","%d",&seed); + } + + MPI_Bcast(&level,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&seed,1,MPI_INT,0,MPI_COMM_WORLD); + + read_dirs(); + setup_files(); + + if (my_rank==0) + { + if (append) + fdat=fopen(par_file,"rb"); + else + fdat=fopen(par_file,"wb"); + + error_root(fdat==NULL,1,"read_infile [ym1.c]", + "Unable to open parameter file"); + } + + read_lat_parms(); + read_bc_parms(); + read_hmc_parms(); + read_schedule(); + read_integrator(); + read_wflow_parms(); + + if (my_rank==0) + { + fclose(fin); + fclose(fdat); + + if (append==0) + copy_file(par_file,par_save); + } +} + + +static void check_old_log(int ic,int *nl,int *icnfg) +{ + int ir,isv; + int np[4],bp[4]; + + fend=fopen(log_file,"r"); + error_root(fend==NULL,1,"check_old_log [ym1.c]", + "Unable to open log file"); + (*nl)=0; + (*icnfg)=0; + ir=1; + isv=0; + + while (fgets(line,NAME_SIZE,fend)!=NULL) + { + if (strstr(line,"process grid")!=NULL) + { + ir&=(sscanf(line,"%dx%dx%dx%d process grid, %dx%dx%dx%d", + np,np+1,np+2,np+3,bp,bp+1,bp+2,bp+3)==8); + + ipgrd[0]=((np[0]!=NPROC0)||(np[1]!=NPROC1)|| + (np[2]!=NPROC2)||(np[3]!=NPROC3)); + ipgrd[1]=((bp[0]!=NPROC0_BLK)||(bp[1]!=NPROC1_BLK)|| + (bp[2]!=NPROC2_BLK)||(bp[3]!=NPROC3_BLK)); + } + else if (strstr(line,"Trajectory no")!=NULL) + { + ir&=(sscanf(line,"Trajectory no %d",nl)==1); + isv=0; + } + else if (strstr(line,"Configuration no")!=NULL) + { + ir&=(sscanf(line,"Configuration no %d",icnfg)==1); + isv=1; + } + } + + fclose(fend); + + error_root(ir!=1,1,"check_old_log [ym1.c]","Incorrect read count"); + + error_root(ic!=(*icnfg),1,"check_old_log [ym1.c]", + "Continuation run:\n" + "Initial configuration is not the last one of the previous run"); + + error_root(isv==0,1,"check_old_log [ym1.c]", + "Continuation run:\n" + "The log file extends beyond the last configuration save"); +} + + +static void check_old_dat(int nl) +{ + int nt; + dat_t ndat; + + fdat=fopen(dat_file,"rb"); + error_root(fdat==NULL,1,"check_old_dat [ym1.c]", + "Unable to open data file"); + nt=0; + + while (read_dat(1,&ndat)==1) + nt=ndat.nt; + + fclose(fdat); + + error_root(nt!=nl,1,"check_old_dat [ym1.c]", + "Continuation run: Incomplete or too many data records"); +} + + +static void check_old_msdat(int nl) +{ + int ic,ir,nt,pnt,dnt; + + fdat=fopen(msdat_file,"rb"); + error_root(fdat==NULL,1,"check_old_msdat [ym1.c]", + "Unable to open data file"); + + check_file_head(); + + nt=0; + dnt=0; + pnt=0; + + for (ic=0;;ic++) + { + ir=read_data(); + + if (ir==0) + { + error_root(ic==0,1,"check_old_msdat [ym1.c]", + "No data records found"); + break; + } + + nt=data.nt; + + if (ic==1) + { + dnt=nt-pnt; + error_root(dnt<1,1,"check_old_msdat [ym1.c]", + "Incorrect trajectory separation"); + } + else if (ic>1) + error_root(nt!=(pnt+dnt),1,"check_old_msdat [ym1.c]", + "Trajectory sequence is not equally spaced"); + + pnt=nt; + } + + fclose(fdat); + + error_root((nt!=nl)||((ic>1)&&(dnt!=dtr_ms)),1, + "check_old_msdat [ym1.c]","Last trajectory numbers " + "or the trajectory separations do not match"); +} + + +static void check_files(int *nl,int *icnfg) +{ + int icmax,ic; + + ipgrd[0]=0; + ipgrd[1]=0; + + if (my_rank==0) + { + if (noloc) + error_root(cnfg[strlen(cnfg)-1]=='*',1, + "check_files [ym1.c]","Attempt to read an " + "imported configuration when -noloc is set"); + + if (append) + { + error_root(strstr(cnfg,nbase)!=cnfg,1,"check_files [ym1.c]", + "Continuation run:\n" + "Run name does not match the previous one"); + error_root(sscanf(cnfg+strlen(nbase),"n%d",&ic)!=1,1, + "check_files [ym1.c]","Continuation run:\n" + "Unable to read configuration number from file name"); + + check_old_log(ic,nl,icnfg); + check_old_dat(*nl); + if (noms==0) + check_old_msdat(*nl); + + (*icnfg)+=1; + } + else + { + fin=fopen(log_file,"r"); + fdat=fopen(dat_file,"rb"); + + if (noms==0) + fend=fopen(msdat_file,"rb"); + else + fend=NULL; + + error_root((fin!=NULL)||(fdat!=NULL)||(fend!=NULL),1, + "check_files [ym1.c]", + "Attempt to overwrite old *.log or *.dat file"); + + if (noms==0) + { + fdat=fopen(msdat_file,"wb"); + error_root(fdat==NULL,1,"check_files [ym1.c]", + "Unable to open measurement data file"); + write_file_head(); + fclose(fdat); + } + + (*nl)=0; + (*icnfg)=1; + } + + icmax=(*icnfg)+(ntr-nth)/dtr_cnfg; + + if (noloc==0) + error_root(name_size("%s/%sn%d_%d",loc_dir,nbase,icmax,NPROC-1)>= + NAME_SIZE,1,"check_files [ym1.c]", + "loc_dir name is too long"); + + if (noexp==0) + error_root(name_size("%s/%sn%d",cnfg_dir,nbase,icmax)>=NAME_SIZE,1, + "check_files [ym1.c]","cnfg_dir name is too long"); + + if (scnfg) + { + if (cnfg[strlen(cnfg)-1]=='*') + error_root(name_size("%s/%s%d",loc_dir,cnfg,NPROC-1)>=NAME_SIZE,1, + "check_files [ym1.c]","loc_dir name is too long"); + else + error_root(name_size("%s/%s",cnfg_dir,cnfg)>=NAME_SIZE,1, + "check_files [ym1.c]","cnfg_dir name is too long"); + } + } + + MPI_Bcast(nl,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(icnfg,1,MPI_INT,0,MPI_COMM_WORLD); +} + + +static void init_ud(void) +{ + char *p; + + if (scnfg) + { + if (cnfg[strlen(cnfg)-1]!='*') + { + sprintf(cnfg_file,"%s/%s",cnfg_dir,cnfg); + import_cnfg(cnfg_file); + } + else + { + sprintf(line,"%s/%s",loc_dir,cnfg); + p=line+strlen(line)-1; + p[0]='\0'; + sprintf(cnfg_file,"%s_%d",line,my_rank); + read_cnfg(cnfg_file); + } + } + else + random_ud(); +} + + +static void init_rng(int icnfg) +{ + int ic; + + if (append) + { + if (cnfg[strlen(cnfg)-1]!='*') + { + if (norng) + start_ranlux(level,seed^(icnfg-1)); + else + { + ic=import_ranlux(rng_file); + error_root(ic!=(icnfg-1),1,"init_rng [ym1.c]", + "Configuration number mismatch (*.rng file)"); + } + } + } + else + start_ranlux(level,seed); +} + + +static void store_ud(su3_dble *usv) +{ + su3_dble *udb; + + udb=udfld(); + cm3x3_assign(4*VOLUME,udb,usv); +} + + +static void recall_ud(su3_dble *usv) +{ + su3_dble *udb; + + udb=udfld(); + cm3x3_assign(4*VOLUME,usv,udb); + set_flags(UPDATED_UD); +} + + +static void set_data(int nt) +{ + int in,dn,nn; + double eps; + + data.nt=nt; + dn=file_head.dn; + nn=file_head.nn; + eps=file_head.eps; + + for (in=0;in0); + dn=file_head.dn; + nn=file_head.nn; + eps=file_head.eps; + + din=nn/10; + if (din<1) + din=1; + + printf("Measurement run:\n\n"); + + for (in=0;in<=nn;in+=din) + printf("n = %3d, t = %.2e, Wact = %.6e, Yact = %.6e, Q = % .2e\n", + in*dn,eps*(double)(in*dn),Wact[in],Yact[in],Qtop[in]); + + printf("\n"); + printf("Configuration fully processed in %.2e sec ",wtms); + printf("(average = %.2e sec)\n",wtmsall/(double)(nms)); + printf("Measured data saved\n\n"); + fflush(flog); + } +} + + +static void save_cnfg(int icnfg) +{ + int ie; + + ie=check_bc(0.0)^0x1; + ie|=chs_ubnd(1); + error_root(ie!=0,1,"save_cnfg [ym1.c]","Unexpected boundary values"); + + if (noloc==0) + { + sprintf(cnfg_file,"%s/%sn%d_%d",loc_dir,nbase,icnfg,my_rank); + write_cnfg(cnfg_file); + } + + if (noexp==0) + { + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + export_cnfg(cnfg_file); + } + + if (my_rank==0) + { + if ((noloc==0)&&(noexp==0)) + printf("Configuration no %d saved on the local disks " + "and exported\n\n",icnfg); + else if (noloc==0) + printf("Configuration no %d saved on the local disks\n\n",icnfg); + else if (noexp==0) + printf("Configuration no %d exported\n\n",icnfg); + } +} + + +static void check_endflag(int *iend) +{ + if (my_rank==0) + { + fend=fopen(end_file,"r"); + + if (fend!=NULL) + { + fclose(fend); + remove(end_file); + (*iend)=1; + printf("End flag set, run stopped\n\n"); + } + else + (*iend)=0; + } + + MPI_Bcast(iend,1,MPI_INT,0,MPI_COMM_WORLD); +} + + +static void remove_cnfg(int icnfg) +{ + if ((rmold)&&(icnfg>=1)) + { + if (noloc==0) + { + sprintf(cnfg_file,"%s/%sn%d_%d",loc_dir,nbase,icnfg,my_rank); + remove(cnfg_file); + } + + if ((noexp==0)&&(my_rank==0)) + { + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + remove(cnfg_file); + } + } +} + + +int main(int argc,char *argv[]) +{ + int nl,icnfg; + int n,iend,iac; + double act0[2],act1[2],w0[2],w1[2],npl,siac; + double wt1,wt2,wtcyc,wtall,wtms,wtmsall; + su3_dble **usv; + dat_t ndat; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + read_infile(argc,argv); + if (noms==0) + alloc_data(); + check_files(&nl,&icnfg); + geometry(); + alloc_wud(1); + + if (noms==0) + { + alloc_data(); + + if (flint) + alloc_wfd(1); + } + + print_info(icnfg); + set_mdsteps(); + init_ud(); + init_rng(icnfg); + + if (bc_type()==0) + npl=(double)(6*(N0-1)*N1)*(double)(N2*N3); + else + npl=(double)(6*N0*N1)*(double)(N2*N3); + + iend=0; + siac=0.0; + wtcyc=0.0; + wtall=0.0; + wtms=0.0; + wtmsall=0.0; + + for (n=0;(iend==0)&&(n=nth)&&(((ntr-n-1)%dtr_ms)==0)) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + usv=reserve_wud(1); + store_ud(usv[0]); + set_data(nl+n+1); + recall_ud(usv[0]); + release_wud(); + + MPI_Barrier(MPI_COMM_WORLD); + wt2=MPI_Wtime(); + + wtms=wt2-wt1; + wtmsall+=wtms; + save_msdat(n,wtms,wtmsall); + } + } + + if (((n+1)>=nth)&&(((ntr-n-1)%dtr_cnfg)==0)) + { + save_cnfg(icnfg); + export_ranlux(icnfg,rng_file); + check_endflag(&iend); + error_chk(); + + if (my_rank==0) + { + fflush(flog); + copy_file(log_file,log_save); + copy_file(dat_file,dat_save); + if (noms==0) + copy_file(msdat_file,msdat_save); + copy_file(rng_file,rng_save); + } + + remove_cnfg(icnfg-1); + icnfg+=1; + } + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ym1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ym1.in new file mode 100644 index 0000000000000000000000000000000000000000..82e0c96af78674f86c9781d16477a84354e57c74 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ym1.in @@ -0,0 +1,45 @@ + +[Run name] +name Snoopy137 + +[Directories] +log_dir ../data/ym1/log +dat_dir ../data/ym1/dat +loc_dir /ndata/ym1/cnfg +cnfg_dir /data/ym1/cnfg + +[Lattice parameters] +beta 6.00 +c0 1.6667 + +[Boundary conditions] +type 2 +phi 0.12 -0.56 +phi' 0.92 0.76 +cG 1.10 +cG' 1.05 + +[Random number generator] +level 0 +seed 73099 + +[Trajectory length] +tau 3.0 + +[MD integrator] +integrator OMF4 +lambda 0.19 +nstep 16 + +[MD trajectories] +nth 320 +ntr 32000 +dtr_log 4 +dtr_ms 8 +dtr_cnfg 32 + +[Wilson flow] +integrator RK3 +eps 2.0e-2 +nstep 100 +dnms 10 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..a037d62153c2163b5e1c6ad8d2a7a4f320b6f22a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/INDEX @@ -0,0 +1,68 @@ + +******************************************************************************** + + Module Directories + +******************************************************************************** + +archive Import and export programs for the double-precision + gauge and quark fields. + +block Definition of blocks and block grids. + +dirac Programs for the O(a)-improved Wilson-Dirac operator. + +dfl Deflation subspace generation and deflated SAP+GCR + solver. + +flags Flags and parameter data base. + +forces Molecular-dynamics forces and associated actions. + +lattice Lattice geometry and programs implementing the boundary + conditions in time. + +linalg Generic linear algebra programs for spinor fields, + fields with values in the Lie algebra of SU(3) and + complex scalar fields. + +linsolv Generic Krylov-space solvers. + +little Computation and action of the little Dirac operator + (= restriction of the Wilson-Dirac operator to the + the deflation subspace). + +mdflds Allocation of the fundamental momentum, force + and pseudo-fermion fields. + +nompi Programs used in non-MPI check and analysis programs. + +random Random number generator, gaussian random numbers, + initialization of ranlux. + +ratfcts Rational function data base. + +sap Schwarz alternating procedure and SAP+GCR solver. + +sflds Generic initialization and assignment programs for + spinor fields. + +su3fcts Collection of 3x3 matrix functions. + +sw_term Computation of the Sheikholeslami-Wohlert term. + +tcharge Symmetric field tensor and topological charge. + +uflds Allocation of the fundamental gauge fields. + +update Molecular-dynamics integration, HMC algorithm and + reweighting factors. + +utils Utility programs: aligned allocation, error functions, + endianess functions, functions needed to read input + files, workspace allocation. + +vflds Generic initialization and assignment programs for + for complex scalar fields. + +wflow Integration of the (Wilson) gradient flow. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/archive/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/archive/README new file mode 100644 index 0000000000000000000000000000000000000000..e3bbb506143854b2d27d96c894eed0f025645980 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/archive/README @@ -0,0 +1,69 @@ + +******************************************************************************** + + I/O functions for field configurations + +******************************************************************************** + + +Files +----- + +archive.c Programs to read and write gauge-field configurations. + +sarchive.c Programs to read and write global double-precision spinor + fields. + +Include file +------------ + +The file archive.h defines the prototypes for all externally accessible +functions that are defined in the *.c files listed above. + + +List of functions +----------------- + +void write_cnfg(char *out) + Writes the lattice sizes, the process grid sizes, the coordinates + of the calling process, the state of the random number generators, + the local plaquette sum and the local double-precision gauge field + to the file "out". + +void read_cnfg(char *in) + Reads the local double-precision gauge field from the file "in", + assuming it was written to the file by the program write_cnfg(). + The program then resets the random number generator and checks + that the restored field is compatible with the chosen boundary + conditions. + +void export_cnfg(char *out) + Writes the lattice sizes and the global double-precision gauge + field to the file "out" from process 0 in the universal format + specified below (see the notes). + +void import_cnfg(char *in) + Reads the global double-precision gauge field from the file "in" + on process 0, assuming the field was written to the file in the + universal format. The field is periodically extended if needed + and the program then checks that the configuration is compatible + with the chosen boundary conditions (see the notes). + +void write_sfld(char *out,spinor_dble *sd) + Writes the lattice sizes, the process grid sizes, the coordinates + of the calling process, the square of the norm of the spinor field + sd and the local part of the latter to the file "out". + +void read_sfld(char *in,spinor_dble *sd) + Reads the local part of the spinor field sd from the file "in", + assuming the field was written to the file by write_sfld(). + +void export_sfld(char *out,spinor_dble *sd) + Writes the lattice sizes and the spinor field sd to the file "out" + from process 0 in the universal format specified below (see the + notes). + +void import_sfld(char *in,spinor_dble *sd) + Reads the spinor field sd from the file "in" on process 0, assuming + the field was written to the file in the universal format (see the + notes). diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/archive/archive.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/archive/archive.c new file mode 100644 index 0000000000000000000000000000000000000000..925af77674a10bbbbd903935e06619361632bc7c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/archive/archive.c @@ -0,0 +1,597 @@ + +/******************************************************************************* +* +* File archive.c +* +* Copyright (C) 2005, 2007, 2009-2014 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Programs to read and write gauge-field configurations. +* +* The externally accessible functions are +* +* void write_cnfg(char *out) +* Writes the lattice sizes, the process grid sizes, the coordinates +* of the calling process, the state of the random number generators, +* the local plaquette sum and the local double-precision gauge field +* to the file "out". +* +* void read_cnfg(char *in) +* Reads the local double-precision gauge field from the file "in", +* assuming it was written to the file by the program write_cnfg(). +* The program then resets the random number generator and checks +* that the restored field is compatible with the chosen boundary +* conditions. +* +* void export_cnfg(char *out) +* Writes the lattice sizes and the global double-precision gauge +* field to the file "out" from process 0 in the universal format +* specified below (see the notes). +* +* void import_cnfg(char *in) +* Reads the global double-precision gauge field from the file "in" +* on process 0, assuming the field was written to the file in the +* universal format. The field is periodically extended if needed +* and the program then checks that the configuration is compatible +* with the chosen boundary conditions (see the notes). +* +* Notes: +* +* The program export_cnfg() first writes the lattice sizes and the average +* of the plaquette Re(tr{U(p)}) to the output file. Then follow the 8 link +* variables in the directions +0,-0,...,+3,-3 at the first odd point, the +* second odd point, and so on. The order of the point (x0,x1,x2,x3) with +* Cartesian coordinates in the range 0<=x0 +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "archive.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int endian,ns,nd,*state=NULL; +static su3_dble *ubuf=NULL,*vbuf,*udb; + + +static void alloc_state(void) +{ + int n; + + ns=rlxs_size(); + nd=rlxd_size(); + + if (nseps); + set_bc(); + plaq1=plaq_sum_dble(0)/nplaq; + ie|=(fabs(plaq1-plaq0)>eps); + error_loc(ie!=0,1,"read_cnfg [archive.c]", + "Incorrect average plaquette"); + error_chk(); +} + + +static void check_machine(void) +{ + error_root(sizeof(stdint_t)!=4,1,"check_machine [archive.c]", + "Size of a stdint_t integer is not 4"); + error_root(sizeof(double)!=8,1,"check_machine [archive.c]", + "Size of a double is not 8"); + + endian=endianness(); + error_root(endian==UNKNOWN_ENDIAN,1,"check_machine [archive.c]", + "Unkown endianness"); +} + + +static void alloc_ubuf(int my_rank) +{ + if (my_rank==0) + { + ubuf=amalloc(4*(L3+N3)*sizeof(su3_dble),ALIGN); + vbuf=ubuf+4*L3; + } + else + { + ubuf=amalloc(4*L3*sizeof(su3_dble),ALIGN); + vbuf=NULL; + } + + error(ubuf==NULL,1,"alloc_ubuf [archive.c]", + "Unable to allocate auxiliary array"); +} + + +static void get_links(int iy) +{ + int y3,ifc; + su3_dble *u,*v; + + v=ubuf; + iy*=L3; + + if (ipt[iy]<(VOLUME/2)) + iy+=1; + + for (y3=0;y30) + { + if (my_rank==0) + { + MPI_Send(&dmy,1,MPI_INT,n,tag0,MPI_COMM_WORLD); + MPI_Recv(ubuf,4*L3*18,MPI_DOUBLE,n,tag1,MPI_COMM_WORLD,&stat); + } + else if (my_rank==n) + { + MPI_Recv(&dmy,1,MPI_INT,0,tag0,MPI_COMM_WORLD,&stat); + MPI_Send(ubuf,4*L3*18,MPI_DOUBLE,0,tag1,MPI_COMM_WORLD); + } + } + + if (my_rank==0) + { + if (endian==BIG_ENDIAN) + bswap_double(4*L3*18,ubuf); + iw=fwrite(ubuf,sizeof(su3_dble),4*L3,fout); + iwa|=(iw!=(4*L3)); + } + } + } + + if (my_rank==0) + { + error_root(iwa!=0,1,"export_cnfg [archive.c]", + "Incorrect write count"); + fclose(fout); + } +} + + +void import_cnfg(char *in) +{ + int my_rank,np[4],ir,ie; + int ira,dmy,tag0,tag1; + int n0,n1,n2,n3,nc0,nc1,nc2,nc3; + int x0,x1,x2,y0,y1,y2,y3,c0,c1,c2,ix,iy,ic; + int n,k,l; + stdint_t lsize[4]; + double nplaq,plaq0,plaq1,eps; + MPI_Status stat; + FILE *fin=NULL; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (ubuf==NULL) + { + check_machine(); + alloc_ubuf(my_rank); + } + + dmy=1; + tag0=mpi_tag(); + tag1=mpi_tag(); + udb=udfld(); + + if (my_rank==0) + { + fin=fopen(in,"rb"); + error_root(fin==NULL,1,"import_cnfg [archive.c]", + "Unable to open input file"); + + ir=fread(lsize,sizeof(stdint_t),4,fin); + ir+=fread(&plaq0,sizeof(double),1,fin); + error_root(ir!=5,1,"import_cnfg [archive.c]","Incorrect read count"); + + if (endian==BIG_ENDIAN) + { + bswap_int(4,lsize); + bswap_double(1,&plaq0); + } + + np[0]=(int)(lsize[0]); + np[1]=(int)(lsize[1]); + np[2]=(int)(lsize[2]); + np[3]=(int)(lsize[3]); + + error_root((np[0]<1)||((N0%np[0])!=0)|| + (np[1]<1)||((N1%np[1])!=0)|| + (np[2]<1)||((N2%np[2])!=0)|| + (np[3]<1)||((N3%np[3])!=0),1,"import_cnfg [archive.c]", + "Unexpected or incompatible lattice sizes"); + + error_root((np[0]!=N0)&&(bc_type()!=3),1,"import_cnfg [archive.c]", + "Periodic extension in time is only possible when\n" + "periodic boundary conditions are chosen"); + } + else + { + np[0]=0; + np[1]=0; + np[2]=0; + np[3]=0; + plaq0=0.0; + } + + MPI_Bcast(np,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&plaq0,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + n0=np[0]; + n1=np[1]; + n2=np[2]; + n3=np[3]; + + nc0=N0/n0; + nc1=N1/n1; + nc2=N2/n2; + nc3=N3/n3; + ira=0; + + for (ix=0;ix<(n0*n1*n2);ix++) + { + x0=ix/(n1*n2); + x1=(ix/n2)%n1; + x2=ix%n2; + + if (my_rank==0) + { + n=4*n3; + ir=fread(vbuf,sizeof(su3_dble),n,fin); + ira|=(ir!=n); + + if (endian==BIG_ENDIAN) + bswap_double(n*18,vbuf); + + for (k=1;k0) + { + if (my_rank==0) + { + MPI_Send(vbuf+4*y3,4*L3*18,MPI_DOUBLE,n,tag1,MPI_COMM_WORLD); + MPI_Recv(&dmy,1,MPI_INT,n,tag0,MPI_COMM_WORLD,&stat); + } + else if (my_rank==n) + { + MPI_Recv(ubuf,4*L3*18,MPI_DOUBLE,0,tag1,MPI_COMM_WORLD,&stat); + MPI_Send(&dmy,1,MPI_INT,0,tag0,MPI_COMM_WORLD); + } + } + else if (my_rank==0) + for (l=0;l<(4*L3);l++) + ubuf[l]=vbuf[4*y3+l]; + + if (my_rank==n) + set_links(iy); + } + } + } + + if (my_rank==0) + { + error_root(ira!=0,1,"import_cnfg [archive.c]","Incorrect read count"); + fclose(fin); + } + + set_flags(UPDATED_UD); + ie=check_bc(64.0*DBL_EPSILON); + error_root(ie!=1,1,"import_cnfg [archive.c]", + "Incompatible boundary conditions"); + + ie=0; + nplaq=(double)(6*N0*N1)*(double)(N2*N3); + eps=sqrt(nplaq)*DBL_EPSILON; + plaq1=plaq_sum_dble(1)/nplaq; + ie|=(fabs(plaq1-plaq0)>eps); + set_bc(); + plaq1=plaq_sum_dble(1)/nplaq; + ie|=(fabs(plaq1-plaq0)>eps); + error_root(ie!=0,1,"import_cnfg [archive.c]", + "Incorrect average plaquette"); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/archive/sarchive.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/archive/sarchive.c new file mode 100644 index 0000000000000000000000000000000000000000..c91a99f3f311216fbae16cbf7c0566f02a64bd9a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/archive/sarchive.c @@ -0,0 +1,450 @@ + +/******************************************************************************* +* +* File sarchive.c +* +* Copyright (C) 2007, 2008, 2011, 2013, 2014 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Programs to read and write global double-precision spinor fields. +* +* The externally accessible functions are +* +* void write_sfld(char *out,spinor_dble *sd) +* Writes the lattice sizes, the process grid sizes, the coordinates +* of the calling process, the square of the norm of the spinor field +* sd and the local part of the latter to the file "out". +* +* void read_sfld(char *in,spinor_dble *sd) +* Reads the local part of the spinor field sd from the file "in", +* assuming the field was written to the file by write_sfld(). +* +* void export_sfld(char *out,spinor_dble *sd) +* Writes the lattice sizes and the spinor field sd to the file "out" +* from process 0 in the universal format specified below (see the +* notes). +* +* void import_sfld(char *in,spinor_dble *sd) +* Reads the spinor field sd from the file "in" on process 0, assuming +* the field was written to the file in the universal format (see the +* notes). +* +* Notes: +* +* The spinor fields are assumed to be global quark fields as described in +* main/README.global. Only their physical components (i.e. the spinors on +* the local lattices) are written and read. +* +* The program export_sfld() first writes the global lattice sizes and the +* square-norm of the spinor field. Then follow the spinors at the first +* lattice point, the second point, and so on, in the order given by the +* index +* +* ix=x3+N3*x2+N2*N3*x1+N1*N2*N3*x0, +* +* where N0,N1,N2,N3 are the (global) lattice sizes and (x0,x1,x2,x3) the +* Cartesian coordinates of the points (0<=x0 +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "lattice.h" +#include "linalg.h" +#include "archive.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int endian; +static spinor_dble *sbuf=NULL; + + +void write_sfld(char *out,spinor_dble *sd) +{ + int ldat[16],iw; + double norm; + FILE *fout=NULL; + + error(sd==NULL,1,"write_sfld [sarchive.c]", + "Attempt to access unallocated memory space"); + error(iup[0][0]==0,1,"write_sfld [sarchive.c]", + "Geometry arrays are not set"); + + fout=fopen(out,"wb"); + error_loc(fout==NULL,1,"write_sfld [sarchive.c]", + "Unable to open output file"); + error_chk(); + + ldat[0]=NPROC0; + ldat[1]=NPROC1; + ldat[2]=NPROC2; + ldat[3]=NPROC3; + + ldat[4]=L0; + ldat[5]=L1; + ldat[6]=L2; + ldat[7]=L3; + + ldat[8]=NPROC0_BLK; + ldat[9]=NPROC1_BLK; + ldat[10]=NPROC2_BLK; + ldat[11]=NPROC3_BLK; + + ldat[12]=cpr[0]; + ldat[13]=cpr[1]; + ldat[14]=cpr[2]; + ldat[15]=cpr[3]; + + iw=fwrite(ldat,sizeof(int),16,fout); + norm=norm_square_dble(VOLUME,0,sd); + iw+=fwrite(&norm,sizeof(double),1,fout); + iw+=fwrite(sd,sizeof(spinor_dble),VOLUME,fout); + + error_loc(iw!=(17+VOLUME),1,"write_sfld [sarchive.c]", + "Incorrect write count"); + error_chk(); + fclose(fout); +} + + +void read_sfld(char *in,spinor_dble *sd) +{ + int ldat[16],ir,ie; + double norm0,norm1,eps; + FILE *fin=NULL; + + error(sd==NULL,1,"read_sfld [sarchive.c]", + "Attempt to access unallocated memory space"); + error(iup[0][0]==0,1,"read_sfld [sarchive.c]", + "Geometry arrays are not set"); + + fin=fopen(in,"rb"); + error_loc(fin==NULL,1,"read_sfld [sarchive.c]", + "Unable to open input file"); + error_chk(); + + ir=fread(ldat,sizeof(int),16,fin); + + ie=0; + ie|=((ldat[0]!=NPROC0)||(ldat[1]!=NPROC1)|| + (ldat[2]!=NPROC2)||(ldat[3]!=NPROC3)); + ie|=((ldat[4]!=L0)||(ldat[5]!=L1)|| + (ldat[6]!=L2)||(ldat[7]!=L3)); + ie|=((ldat[8]!=NPROC0_BLK)||(ldat[9]!=NPROC1_BLK)|| + (ldat[10]!=NPROC2_BLK)||(ldat[11]!=NPROC3_BLK)); + ie|=((ldat[12]!=cpr[0])||(ldat[13]!=cpr[1])|| + (ldat[14]!=cpr[2])||(ldat[15]!=cpr[3])); + error(ie!=0,1,"read_sfld [sarchive.c]","Unexpected lattice data"); + + ir+=fread(&norm0,sizeof(double),1,fin); + ir+=fread(sd,sizeof(spinor_dble),VOLUME,fin); + + error_loc(ir!=(17+VOLUME),1,"read_sfld [sarchive.c]", + "Incorrect read count"); + error_chk(); + fclose(fin); + + norm1=norm_square_dble(VOLUME,0,sd); + eps=sqrt(64.0*(double)(VOLUME))*DBL_EPSILON; + error_loc(fabs(norm1-norm0)>(eps*norm0),1,"read_sfld [sarchive.c]", + "Incorrect square norm"); + error_chk(); +} + + +static void check_machine(void) +{ + error_root(sizeof(stdint_t)!=4,1,"check_machine [sarchive.c]", + "Size of a stdint_t integer is not 4"); + error_root(sizeof(double)!=8,1,"check_machine [sarchive.c]", + "Size of a double is not 8"); + error_root(sizeof(spinor_dble)!=192,1,"check_machine [sarchive.c]", + "The spinor_dble structures are not properly packed"); + + endian=endianness(); + error_root(endian==UNKNOWN_ENDIAN,1,"check_machine [sarchive.c]", + "Unkown endianness"); +} + + +static void alloc_sbuf(void) +{ + error(iup[0][0]==0,1,"alloc_sbuf [sarchive.c]", + "Geometry arrays are not set"); + sbuf=amalloc(L3*sizeof(spinor_dble),ALIGN); + error(sbuf==NULL,1,"alloc_sbuf [sarchive.c]", + "Unable to allocate auxiliary array"); +} + + +static void get_spinors(int iy,spinor_dble *sd) +{ + int y3,iz; + spinor_dble *sb; + + sb=sbuf; + iy*=L3; + + for (y3=0;y30) + { + if (my_rank==0) + { + MPI_Send(&dmy,1,MPI_INT,n,tag0,MPI_COMM_WORLD); + MPI_Recv(sbuf,L3*24,MPI_DOUBLE,n,tag1,MPI_COMM_WORLD,&stat); + } + else if (my_rank==n) + { + MPI_Recv(&dmy,1,MPI_INT,0,tag0,MPI_COMM_WORLD,&stat); + MPI_Send(sbuf,L3*24,MPI_DOUBLE,0,tag1,MPI_COMM_WORLD); + } + } + + if (my_rank==0) + { + if (endian==BIG_ENDIAN) + bswap_double(L3*24,(double*)(sbuf)); + iw=fwrite(sbuf,sizeof(spinor_dble),L3,fout); + iwa|=(iw!=L3); + } + } + } + + if (my_rank==0) + { + error_root(iwa!=0,1,"export_sfld [sarchive.c]","Incorrect write count"); + fclose(fout); + } +} + + +void import_sfld(char *in,spinor_dble *sd) +{ + int my_rank,np[4],n,ir; + int ira,dmy,tag0,tag1; + int x0,x1,x2,x3,y0,y1,y2,ix,iy; + stdint_t lsize[4]; + double norm0,norm1,eps; + MPI_Status stat; + FILE *fin=NULL; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (sbuf==NULL) + { + check_machine(); + alloc_sbuf(); + } + + error(sd==NULL,1,"import_sfld [sarchive.c]", + "Attempt to access unallocated memory space"); + + dmy=1; + tag0=mpi_tag(); + tag1=mpi_tag(); + + if (my_rank==0) + { + fin=fopen(in,"rb"); + error_root(fin==NULL,1,"import_sfld [sarchive.c]", + "Unable to open input file"); + + ir=fread(lsize,sizeof(stdint_t),4,fin); + ir+=fread(&norm0,sizeof(double),1,fin); + error_root(ir!=5,1,"import_sfld [sarchive.c]","Incorrect read count"); + + if (endian==BIG_ENDIAN) + { + bswap_int(4,lsize); + bswap_double(1,&norm0); + } + + error_root((lsize[0]!=N0)||(lsize[1]!=N1)||(lsize[2]!=N2)|| + (lsize[3]!=N3),1,"import_sfld [sarchive.c]", + "Lattice sizes do not match"); + } + else + norm0=0.0; + + ira=0; + + for (ix=0;ix<(N0*N1*N2);ix++) + { + x0=ix/(N1*N2); + x1=(ix/N2)%N1; + x2=ix%N2; + + y0=x0%L0; + y1=x1%L1; + y2=x2%L2; + iy=y2+L2*y1+L1*L2*y0; + + np[0]=x0/L0; + np[1]=x1/L1; + np[2]=x2/L2; + + for (x3=0;x30) + { + if (my_rank==0) + { + MPI_Send(sbuf,L3*24,MPI_DOUBLE,n,tag1,MPI_COMM_WORLD); + MPI_Recv(&dmy,1,MPI_INT,n,tag0,MPI_COMM_WORLD,&stat); + } + else if (my_rank==n) + { + MPI_Recv(sbuf,L3*24,MPI_DOUBLE,0,tag1,MPI_COMM_WORLD,&stat); + MPI_Send(&dmy,1,MPI_INT,0,tag0,MPI_COMM_WORLD); + } + } + + if (my_rank==n) + set_spinors(iy,sd); + } + } + + if (my_rank==0) + { + error_root(ira!=0,1,"import_sfld [sarchive.c]","Incorrect read count"); + fclose(fin); + } + + norm1=norm_square_dble(VOLUME,1,sd); + eps=sqrt(64.0*(double)(N0*N1)*(double)(N2*N3))*DBL_EPSILON; + error_root(fabs(norm1-norm0)>(eps*norm0),1,"import_sfld [sarchive.c]", + "Incorrect square norm"); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/README new file mode 100644 index 0000000000000000000000000000000000000000..02ba0ff9102fd40fb73b100cac134579a2291067 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/README @@ -0,0 +1,152 @@ + +******************************************************************************** + + Block allocation and block field initialization programs + +******************************************************************************** + + +Files +----- + +blk_grid.c Block grid allocation. + +block.c Basic allocation programs for blocks of lattice points. + +map_s2blk.c Copying of spinor fields to and from the blocks in a + block grid. + +map_sw2blk.c Copying of the SW fields to the blocks in a block grid. + +map_u2blk.c Copying of the gauge fields to the blocks in a block grid. + + +Include file +------------ + +The file block.h defines the prototypes for all externally accessible +functions that are defined in the *.c files listed above. + +The types block_t, bndry_t and blk_grid_t are also defined there and the +meaning of the entries in these structures is explained in the file +README.block. + + +List of functions +----------------- + +void alloc_bgr(blk_grid_t grid) + Allocates the specified block grid. The block array and the block + fields are put in the static memory of this module and are properly + initialized. + +block_t *blk_list(blk_grid_t grid,int *nb,int *isw) + Returns the pointer to the block array of the specified grid. The + number of blocks on the local lattice is assigned to nb and isw is + set to 0 or 1 depending on whether the first block is black or white + (by definition it is black on the first process). If the block grid + is not allocated, the program returns NULL and sets nb and isw to 0. + +void alloc_blk(block_t *b,int *bo,int *bs, + int iu,int iud,int ns,int nsd) + Sets the offset and side-lengths of the block b to bo[4] and bs[4], + respectively, and allocates the block fields depending on the values + of the other parameters. The single-precision gauge and SW fields are + allocated if iu=1, the double-precision gauge and SW fields if iud=1, + while ns and nsd are the numbers of single- and double-precision Dirac + fields that are allocated. All elements of the block are properly + initialized and the share flag b.shf is set to 0x0 (see the notes). + +void alloc_bnd(block_t *b,int iu,int iud,int nw,int nwd) + Allocates the boundary structures b.bb in the block b and the fields + in there depending on the parameters iu,iud,nw and nwd. The single- + and double-precision gauge fields are allocated if iu=1 and iud=1, + respectively, while nw and nwd are the numbers of single- and double- + precision Weyl fields that are allocated. All elements of the block + are then properly initialized (see the notes). + +void clone_blk(block_t *b,int shf,int *bo,block_t *c) + Sets the offset of the block c to bo[4] and its side lengths to + b.bs[4]. The fields in c are then allocated depending on the bits + b1,b2,..,b8 (counting from the lowest) of the share flag shf. The + relevant bits are: + + b2=1: b.ipt,b.iup and b.idn are shared, + b3=1: b.u, b.bb.u and b.sw are shared, + b4=1: b.ud, b.bb.ud and b.swd are shared, + b5=1: b.s is shared, + b6=1: b.sd is shared. + b7=1: b.bb.w is shared, + b8=1: b.bb.wd is shared. + + All fields that are not shared and are allocated on b are allocated + on c as well, while the pointers to the shared fields are set to those + of b. An error occurs if a field is shared according to the share flag + b.shf on b but not according to shf. Moreover, the offset differences + bo[mu]-b.bo[mu] must be integer multiples of b.bs[mu] for all mu. The + share flag c.shf is set to shf. + +void free_blk(block_t *b) + Frees the arrays in the block b and in the boundaries b.bb that were + previously allocated by alloc_blk(), alloc_bnd() or clone_blk(). The + boundary structures are then freed too (if they were allocated) and + all entries in the block structure are set to 0 (or NULL). + +int ipt_blk(block_t *b,int *x) + Returns the index of the lattice point in the block b with Cartesian + coordinates x[4] relative to the base point of b. + +void assign_s2sblk(blk_grid_t grid,int n,ptset_t set,spinor *s,int k) + Assigns the relevant part of the global single-precision spinor field s + to the single-precision field b.s[k] on the n'th block of the specified + block grid. Depending on the specified point set, the field on the even, + odd or all points is copied. + +void assign_sblk2s(blk_grid_t grid,int n,ptset_t set,int k,spinor *s) + Assigns the single-precision spinor field b.s[k] on the n'th block of + the specified block grid to the relevant part of the global single- + precision field s. Depending on specified point set, the field on the + even, odd or all points is copied. + +void assign_s2sdblk(blk_grid_t grid,int n,ptset_t set,spinor *s,int k) + Assigns the relevant part of the global single-precision spinor field s + to the double-precision field b.sd[k] on the n'th block of the specified + block grid. Depending on the specified point set, the field on the even, + odd or all points is copied. + +void assign_sd2sdblk(blk_grid_t grid,int n,ptset_t set, + spinor_dble *sd,int k) + Assigns the relevant part of the global double-precision spinor field sd + to the double-precision field b.sd[k] on the n'th block of the specified + block grid. Depending on the specified point set, the field on the even, + odd or all points is copied. + +void assign_sdblk2sd(blk_grid_t grid,int n,ptset_t set, + int k,spinor_dble *sd) + Assigns the single-precision spinor field b.sd[k] on the n'th block of + the specified block grid to the relevant part of the global single- + precision field sd. Depending on specified point set, the field on the + even, odd or all points is copied. + +int assign_swd2swbgr(blk_grid_t grid,ptset_t set) + Assigns the global double-precision SW field to the corresponding + single-precision fields in the specified grid. On the given point + set, the copied Pauli matrices are inverted before assignment and + the program returns 0 or 1 depending on whether the inversions were + safe or not. + +int assign_swd2swdblk(blk_grid_t grid,int n,ptset_t set) + Assigns the global double-precision SW field to the corresponding + double-precision field on the n'th block of the specified grid. On + the given point set, the copied Pauli matrices are inverted before + assignment and the program returns 0 or 1 depending on whether the + inversions were safe or not. + +void assign_ud2ubgr(blk_grid_t grid) + Assigns the global double-precision gauge field to the corresponding + single-precision fields in the specified block grid (see the notes). + +void assign_ud2udblk(blk_grid_t grid,int n) + Assigns the global double-precision gauge field to the corresponding + double-precision field on the n'th block of the specified block grid + (see the notes). diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/README.block b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/README.block new file mode 100644 index 0000000000000000000000000000000000000000..316ca7a1b95cc1fb4c2d3cce3a316fb757d3e7b8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/README.block @@ -0,0 +1,235 @@ + +******************************************************************************** + + Block structure explained + +******************************************************************************** + +Rectangular blocks of lattice points and their exterior boundaries are +described by the structures block_t and bndry_t that are defined in the +header file block.h. These objects can be easily handled by passing the +corresponding pointers to the functions that operate on them. + +It is currently not possible to allocate blocks that are not fully +contained in the local lattice. Moreover, the block sizes must be even +and not smaller than 4. The exterior boundaries of a block may, +however, overlap with the lattices on the neighbouring processes. + + +Block structure +--------------- + +Block data and fields are collected in a structure + +typedef struct +{ + int *bo,*bs,vol,vbb,nbp,ns,nsd,shf; + int *ipt,*imb,*ibp; + int (*iup)[4],(*idn)[4]; + su3 *u; + su3_dble *ud; + pauli *sw; + pauli_dble *swd; + spinor **s; + spinor_dble **sd; + bndry_t *bb; +} block_t; + +block_t b; + +with the following entries: + +b.bo[4] Cartesian coordinates (in the local lattice) of the + base point of the block. + +b.bs[4] Linear sizes of the block in the four dimensions. The + local coordinates in direction mu of the points in the + block thus range from b.bo[mu] to b.bo[mu]+b.bs[mu]-1 + inclusive. + +b.vol Number of points in the block. + +b.vbb Total number of exterior boundary points of the + block. + +b.nbp Number of points in the block at global time 0 + (boundary conditions type 0,1 or 2)and and time + NPROC0*L0-1 (boundary conditions type 0). + +b.ns Number of single-precision spinor fields on the block. + +b.nsd Number of double-precision spinor fields on the block. + +b.shf The bits b1,b2,...,b7 in this number (counting from + the lowest) indicate that + + b1=1: The block is protected, + b2=1: The geometry arrays are shared, + b3=1: b.u, bb.u and b.sw are shared, + b4=1: b.ud, bb.ud and b.swd are shared, + b5=1: b.s is shared, + b6=1: b.sd is shared, + b7=1: bb.w is shared, + b8=1: bb.wd is shared + + (the last two bits refer to the Weyl fields on the + exterior boundaries of the block). As explained below, + block fields can be shared among the blocks in a block + grid. Protected blocks cannot be freed or reallocated. + +b.ipt[b.vol+1] The block points are labeled by an index ix. If + x0,x1,x2,x3 are the coordinates of a block point + relative to the base point, a primitive point label + is iy=x3+b.bs[3]*x2+...+b.bs[3]*b.bs[2]*b.bs[1]*x0. + This array returns the actual label ix=b.ipt[iy] + (the last entry in the array is not used). + +b.imb[b.vol+1] For a given block point with label ix, this array + returns the label iz=b.imb[ix] of the point in the + local lattice. The array thus defines the embedding + of the block in full lattice (the last entry in the + array is not used). + +b.ibp[b.nbp] Array of the labels ix of the block points at global + time 0 (boundary conditions type 0,1 or 2) and time + NPROC0*L0-1 (boundary conditions type 0). The labels + are in ascending order. In particular, the first and + second half of the array contain the labels of the + even and odd points, respectively. + +b.iup[b.vol][4] Block geometry arrays, giving the labels of the +b.idn[b.vol][4] neighbours of a given block point. If the neighbour + is on the exterior boundary of the block, the arrays + return the value b.vol. + +b.u[4*b.vol] The single-precision gauge field on the block is + stored in this array in such a way that the 8 link + variables at the odd point with label ix are the 8 + elements at b.u+8*(ix-b.vol/2) (as on the global + lattice). The links sticking out of the block are + special in the sense that the variables residing + there are not used. + +b.ud[4*b.vol] This array contains the double-precision gauge field + on the block. The storage conventions are the same as + in the case of the single-precision field. + +b.sw[2*b.vol] The single-precision SW term is allocated together + with the single-precision gauge field. The upper + and lower Pauli matrix at the point with label ix + are stored at b.sw[2*ix] and b.sw[2*ix+1]. + +b.swd[2*b.vol] The double-precision SW term is allocated together + with the double-precision gauge field. The upper + and lower Pauli matrix at the point with label ix + are stored at b.swd[2*ix] and b.swd[2*ix+1]. + +b.s[b.ns][b.vol+1] The value of the k'th single-precision spinor field + at the block point with label ix is b.s[k][ix]. In + each of these fields the last entry is not used. + +b.sd[b.nsd][b.vol+1] The value of the k'th double-precision spinor field + at the block point with label ix is b.sd[k][ix]. In + each of these fields the last entry is not used. + +b.bb[8] Array of boundary structures, one for each face + (see below). + +In general, not all field arrays are allocated. Some blocks may contain the +tsingle-precision gauge and SW fields but not the double-precision fields, +for example. Which fields are allocated and which are shared can be chosen +when the block is allocated (see alloc_blk() [block.c]). + +The phrase "... is not used" refers to an array element that serves as +a place-holder or for another technical purpose. At the beginning of any +subprogram, the variables stored there will, in general, contain random +values. + + +Boundary structure +------------------ + +The geometry of each face of the exterior boundary of a block and the +fields living there are described by a structure + +typedef struct +{ + int ifc,ibn,vol,nw,nwd; + int *ipp,*map,*imb; + su3 *u; + su3_dble *ud; + weyl **w; + weyl_dble **wd; +} bndry_t; + +bndry_t bb; + +with the following entries: + +bb.ifc The faces in the -0,+0,-1,+1,-2,+2,-3,+3 directions + are labeled by a number ifc ranging from 0 to 7. + +bb.ibn Indicates whether the face is contained in the + exterior boundary of the local lattice (bb.ibn=1) + or not (bb.ibn=0). + +bb.vol Number of points in the face. + +bb.nw Number of single-precision Weyl fields on the face. + +bb.nwd Number of double-precision Weyl fields on the face. + +bb.ipp[bb.vol+1] The points in the face are labeled by an index ix. + Each point has a unique nearest point on the block + (its "partner point") with label bb.ipp[ix] (the + last entry in the array is not used). + +bb.map[bb.vol+1] For a given face point with label ix, bb.map[ix] is + the label of the partner point of the corresponding + point on the opposite face of the block (the last + entry in the array is not used). + +bb.imb[bb.vol+1] For a given face point with label ix, bb.imb[ix] + is the label of the point in the local lattice + (or in its exterior boundary; see README.global). + The array thus defines the embedding of the face + in full lattice (the last entry in the array is + not used). + +bb.u[bb.vol] Array of the single-precision gauge-field variables + residing on the links that connect the face points + with their partner points on the block. + +bb.ud[bb.vol] Array of the double-precision gauge-field variables + residing on the links that connect the face points + with their partner points on the block. + +bb.w[bb.nw][bb.vol] The value of the k'th single-precision Weyl field + at the face point with label ix is bb.w[k][ix]. + +bb.wd[bb.nwd][bb.vol] The value of the k'th double-precision Weyl field + at the face point with label ix is bb.wd[k][ix]. + +Which field arrays are allocated may be chosen when the program alloc_bnd() +[block.c] is called. + + + +Block grids +----------- + +Grids of blocks that cover the whole lattice without overlaps can be +allocated and initialized using the programs in the module blk_grid.c. +The enumeration type blk_grid_t (see block.h) lists the currently +available block grids. + +The size of the blocks in a block grid can be chosen when the grid is +allocated, but is required to divide the local lattice. Moreover, the +number of blocks in the local lattice must be even and the total +number of blocks in any space-time direction must also be even. + +Among the blocks of a block grid, the gauge, Dirac spinor and Weyl +fields may be shared, i.e. they are allocated only on the first block +in the local lattice and their addresses are copied to the other +blocks. Which fields are shared can be determined by reading the bits +of the flag b.shf on any one of the blocks b. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/blk_grid.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/blk_grid.c new file mode 100644 index 0000000000000000000000000000000000000000..203622a7729d1d33b852400ab1bcbf47a8d4ec96 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/blk_grid.c @@ -0,0 +1,214 @@ + +/******************************************************************************* +* +* File blk_grid.c +* +* Copyright (C) 2005, 2007, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Block grid allocation. +* +* The externally accessible functions are +* +* void alloc_bgr(blk_grid_t grid) +* Allocates the specified block grid. The block array and the block +* fields are put in the static memory of this module and are properly +* initialized. +* +* block_t *blk_list(blk_grid_t grid,int *nb,int *isw) +* Returns the pointer to the block array of the specified grid. The +* number of blocks on the local lattice is assigned to nb and isw is +* set to 0 or 1 depending on whether the first block is black or white +* (by definition it is black on the first process). If the block grid +* is not allocated, the program returns NULL and sets nb and isw to 0. +* +* Notes: +* +* The block sizes bs[4] and other parameters of the specified block grid +* are obtained from the parameter data base. These and the lattice sizes +* must be such that the lattice can be covered by non-overlapping blocks. +* Moreover, the number of blocks in each direction must be even and the +* local lattices must contain an even number of blocks. This ensures that +* the block grid can be chessboard-coloured and that the number of blocks +* in the local lattice is the same for both colours. +* +* On all processes, the blocks at a given position in the array of blocks +* returned by blk_list() have the same position in the local lattice. The +* blocks are ordered such that the first half of them have the same colour. +* For a given colour, the blocks are ordered according to their index +* +* n[3]+nbl[3]*n[2]+nbl[2]*nbl[3]*n[1]+nbl[1]*nbl[2]*nbl[3]*n[0], +* +* where n[mu]=bo[mu]/bs[mu] are the Cartesian coordinates of the block in +* the block grid and nbl[mu] denotes the numbers of blocks in direction mu. +* All blocks have allocated boundaries and the protection flag set. +* +* The program alloc_bgr() involves communications and must be called on all +* processes simultaneously with the same parameters. A given block grid can +* be allocated only once. +* +*******************************************************************************/ + +#define BLK_GRID_C + +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "sap.h" +#include "block.h" +#include "global.h" + +typedef struct +{ + int nb,isw; + block_t *b; +} bgrid_t; + +static bgrid_t bgr[(int)(BLK_GRIDS)+1]={{0,0,NULL},{0,0,NULL},{0,0,NULL}}; + + +static block_t *blks(int *bs,int iu,int iud,int ns,int nsd, + int iub,int iudb,int nw,int nwd, + int shf,int *nb,int *isw) +{ + int bo[4]; + int n0,n1,n2,n3,m0,m1,m2,m3; + block_t *b,*rbe,*rbo; + + n0=L0/bs[0]; + n1=L1/bs[1]; + n2=L2/bs[2]; + n3=L3/bs[3]; + + (*nb)=n0*n1*n2*n3; + (*isw)=(cpr[0]*n0+cpr[1]*n1+cpr[2]*n2+cpr[3]*n3)&0x1; + + b=malloc((*nb)*sizeof(*b)); + error(b==NULL,1,"blks [blk_grid.c]","Unable to allocate block grid"); + + rbe=b; + rbo=b+(*nb)/2; + + for (m0=0;m01) + { + iprms[0]=igr; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + + error(iprms[0]!=igr,1,"alloc_bgr [blk_grid.c]", + "Parameter is not global"); + } + + error(bgr[igr].b!=NULL,1,"alloc_bgr [blk_grid.c]", + "Block grid is already allocated"); + + bs=NULL; + iu=0; + iud=0; + ns=0; + nsd=0; + iub=0; + iudb=0; + nw=0; + nwd=0; + shf=0x0; + + if (grid==SAP_BLOCKS) + { + sap=sap_parms(); + error_root(sap.ncy==0,1,"alloc_bgr [blk_grid.c]", + "SAP parameters are not set"); + + bs=sap.bs; + iu=1; + ns=3; + iub=1; + shf=0x13; + } + else if (grid==DFL_BLOCKS) + { + dfl=dfl_parms(); + error_root(dfl.Ns==0,1,"alloc_bgr [blk_grid.c]", + "Deflation subspace parameters are not set"); + + bs=dfl.bs; + iud=1; + ns=dfl.Ns+1; + nsd=dfl.Ns+1; + shf=0xb; + } + else + error_root(1,1,"alloc_bgr [blk_grid.c]","Unknown block grid"); + + bgr[igr].b=blks(bs,iu,iud,ns,nsd,iub,iudb,nw,nwd,shf, + &(bgr[igr].nb),&(bgr[grid].isw)); + + if (grid==SAP_BLOCKS) + alloc_sap_bufs(); +} + + +block_t *blk_list(blk_grid_t grid,int *nb,int *isw) +{ + int igr; + + igr=(int)(grid); + (*nb)=bgr[igr].nb; + (*isw)=bgr[igr].isw; + + return bgr[igr].b; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/block.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/block.c new file mode 100644 index 0000000000000000000000000000000000000000..ba5d82b97a28fc6665fc4ce813a3cc562a3b6408 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/block.c @@ -0,0 +1,940 @@ + +/******************************************************************************* +* +* File block.c +* +* Copyright (C) 2005, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Basic allocation programs for blocks of lattice points. +* +* The externally accessible functions are +* +* void alloc_blk(block_t *b,int *bo,int *bs, +* int iu,int iud,int ns,int nsd) +* Sets the offset and side-lengths of the block b to bo[4] and bs[4], +* respectively, and allocates the block fields depending on the values +* of the other parameters. The single-precision gauge and SW fields are +* allocated if iu=1, the double-precision gauge and SW fields if iud=1, +* while ns and nsd are the numbers of single- and double-precision Dirac +* fields that are allocated. All elements of the block are properly +* initialized and the share flag b.shf is set to 0x0 (see the notes). +* +* void alloc_bnd(block_t *b,int iu,int iud,int nw,int nwd) +* Allocates the boundary structures b.bb in the block b and the fields +* in there depending on the parameters iu,iud,nw and nwd. The single- +* and double-precision gauge fields are allocated if iu=1 and iud=1, +* respectively, while nw and nwd are the numbers of single- and double- +* precision Weyl fields that are allocated. All elements of the block +* are then properly initialized (see the notes). +* +* void clone_blk(block_t *b,int shf,int *bo,block_t *c) +* Sets the offset of the block c to bo[4] and its side lengths to +* b.bs[4]. The fields in c are then allocated depending on the bits +* b1,b2,..,b8 (counting from the lowest) of the share flag shf. The +* relevant bits are: +* +* b2=1: b.ipt,b.iup and b.idn are shared, +* b3=1: b.u, b.bb.u and b.sw are shared, +* b4=1: b.ud, b.bb.ud and b.swd are shared, +* b5=1: b.s is shared, +* b6=1: b.sd is shared. +* b7=1: b.bb.w is shared, +* b8=1: b.bb.wd is shared. +* +* All fields that are not shared and are allocated on b are allocated +* on c as well, while the pointers to the shared fields are set to those +* of b. An error occurs if a field is shared according to the share flag +* b.shf on b but not according to shf. Moreover, the offset differences +* bo[mu]-b.bo[mu] must be integer multiples of b.bs[mu] for all mu. The +* share flag c.shf is set to shf. +* +* void free_blk(block_t *b) +* Frees the arrays in the block b and in the boundaries b.bb that were +* previously allocated by alloc_blk(), alloc_bnd() or clone_blk(). The +* boundary structures are then freed too (if they were allocated) and +* all entries in the block structure are set to 0 (or NULL). +* +* int ipt_blk(block_t *b,int *x) +* Returns the index of the lattice point in the block b with Cartesian +* coordinates x[4] relative to the base point of b. +* +* Notes: +* +* The entries of the block and boundary structures are explained in the file +* README.block in this directory. +* +* It is currently not possible to allocate blocks that are not fully +* contained in the local lattice. Moreover, the block sizes must be even +* and not smaller than 4. The exterior boundaries of a block may, however, +* overlap with the lattices on the neighbouring processes. In all cases, +* the scalar elements of the structures and the geometry and field arrays +* are properly initialized (gauge and SW fields are set to 1, Dirac spinor +* and Weyl fields to 0). +* +* Block allocation is a global operation, i.e. alloc_blk(), alloc_bnd(), +* clone_blk() and free_blk() must be called on all processes simultaneously. +* The program ipt_blk() can be called locally. +* +* alloc_blk() and clone_blk() register the blocks as being allocated. In this +* way it is possible to exclude any misuses of the programs such as freeing +* an unallocated block (which could have unpredictable side-effects). An +* already allocated block is first freed and then reallocated by alloc_blk(). +* Blocks b and their boundary structures b.bb cannot be freed or reallocated +* if the lowest bit of the share flag b.shf is equal to 1. +* +*******************************************************************************/ + +#define BLOCK_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "block.h" +#include "global.h" + +static const su3 u0={{0.0f}}; +static const su3_dble ud0={{0.0}}; +static const pauli p0={{0.0f}}; +static const pauli_dble pd0={{0.0}}; +static const weyl w0={{{0.0f}}}; +static const weyl_dble wd0={{{0.0}}}; + +struct ablk_t +{ + block_t *b; + struct ablk_t *next; +}; + +static struct ablk_t *first=NULL; + + +static int ins_blk(block_t *b) +{ + struct ablk_t *p; + + p=malloc(sizeof(*p)); + + if (p!=NULL) + { + (*p).b=b; + (*p).next=first; + first=p; + + return 0; + } + else + return 1; +} + + +static void rmv_blk(block_t *b) +{ + struct ablk_t *p,*q; + + q=NULL; + + for (p=first;p!=NULL;p=(*p).next) + { + if ((*p).b==b) + { + if (q==NULL) + first=(*p).next; + else + (*q).next=(*p).next; + + free(p); + return; + } + + q=p; + } +} + + +static int fnd_blk(block_t *b) +{ + struct ablk_t *p; + + for (p=first;p!=NULL;p=(*p).next) + { + if ((*p).b==b) + return 1; + } + + return 0; +} + + +static void free_bnd(block_t *b) +{ + int shf; + bndry_t *bb; + + shf=(*b).shf; + bb=(*b).bb; + + if (bb==NULL) + return; + + if (!(shf&0x2)) + free((*bb).ipp); + + free((*bb).imb); + + if ((!(shf&0x4))&&((*bb).u!=NULL)) + afree((*bb).u); + + if ((!(shf&0x8))&&((*bb).ud!=NULL)) + afree((*bb).ud); + + if ((!(shf&0x40))&&((*bb).nw>0)) + { + afree((*bb).w[0]); + free((*bb).w); + } + + if ((!(shf&0x80))&&((*bb).nwd>0)) + { + afree((*bb).wd[0]); + free((*bb).wd); + } + + free((*b).bb); + (*b).bb=NULL; +} + + +void free_blk(block_t *b) +{ + int shf; + + if (fnd_blk(b)==0) + return; + + shf=(*b).shf; + error(shf&0x1,1,"free_blk [block.c]", + "Protected block"); + + free_bnd(b); + + free((*b).bo); + (*b).bo=NULL; + (*b).bs=NULL; + + if (!(shf&0x2)) + { + free((*b).ipt); + free((*b).iup); + } + + free((*b).imb); + + (*b).vol=0; + (*b).vbb=0; + (*b).nbp=0; + (*b).shf=0x0; + (*b).ipt=NULL; + (*b).imb=NULL; + (*b).ibp=NULL; + (*b).iup=NULL; + (*b).idn=NULL; + + if ((!(shf&0x4))&&((*b).u!=NULL)) + { + afree((*b).u); + afree((*b).sw); + } + + if ((!(shf&0x8))&&((*b).ud!=NULL)) + { + afree((*b).ud); + afree((*b).swd); + } + + if ((!(shf&0x10))&&((*b).ns>0)) + { + afree((*b).s[0]); + free((*b).s); + } + + if ((!(shf&0x20))&&((*b).nsd>0)) + { + afree((*b).sd[0]); + free((*b).sd); + } + + (*b).ns=0; + (*b).nsd=0; + (*b).u=NULL; + (*b).ud=NULL; + (*b).sw=NULL; + (*b).swd=NULL; + (*b).s=NULL; + (*b).sd=NULL; + + rmv_blk(b); +} + + +static void set_u2unity(int vol,su3 *u) +{ + su3 unity,*um; + + unity=u0; + unity.c11.re=1.0f; + unity.c22.re=1.0f; + unity.c33.re=1.0f; + + um=u+vol; + + for (;uL0)||(bs[0]<4)||((bs[0]%2)!=0)|| + (bo[1]<0)||((bo[1]+bs[1])>L1)||(bs[1]<4)||((bs[1]%2)!=0)|| + (bo[2]<0)||((bo[2]+bs[2])>L2)||(bs[2]<4)||((bs[2]%2)!=0)|| + (bo[3]<0)||((bo[3]+bs[3])>L3)||(bs[3]<4)||((bs[3]%2)!=0),1, + "new_blk [block.c]","Improper choice of block position or size"); + + error_root((ns<0)||(nsd<0),1,"new_blk [block.c]", + "Improper choice of the numbers of spinor fields"); + + (*b).bo=malloc(8*sizeof(*(*b).bo)); + error((*b).bo==NULL,1,"new_blk [block.c]", + "Unable to allocate size arrays"); + (*b).bs=(*b).bo+4; + + for (mu=0;mu<4;mu++) + { + (*b).bo[mu]=bo[mu]; + (*b).bs[mu]=bs[mu]; + } + + (*b).vol=bs[0]*bs[1]*bs[2]*bs[3]; + (*b).vbb=2*(bs[0]*bs[1]*bs[2]+bs[1]*bs[2]*bs[3]+ + bs[2]*bs[3]*bs[0]+bs[3]*bs[0]*bs[1]); + (*b).nbp=0; + + if ((cpr[0]==0)&&(bo[0]==0)&&(bc_type()!=3)) + (*b).nbp+=bs[1]*bs[2]*bs[3]; + if ((cpr[0]==(NPROC0-1))&&((bo[0]+bs[0])==L0)&&(bc_type()==0)) + (*b).nbp+=bs[1]*bs[2]*bs[3]; + + (*b).ns=ns; + (*b).nsd=nsd; + (*b).shf=shf; + + if (shf&0x2) + { + (*b).ipt=NULL; + (*b).iup=NULL; + (*b).idn=NULL; + } + else + { + (*b).ipt=malloc(((*b).vol+1)*sizeof(*(*b).ipt)); + (*b).iup=malloc(2*(*b).vol*sizeof(*(*b).iup)); + error(((*b).ipt==NULL)||((*b).iup==NULL),1, + "new_blk [block.c]","Unable to allocate the geometry arrays"); + (*b).idn=(*b).iup+(*b).vol; + } + + (*b).imb=malloc((((*b).vol+1)+(*b).nbp)*sizeof(*(*b).imb)); + (*b).ibp=(*b).imb+(*b).vol+1; + + if ((shf&0x4)||(iu!=1)) + { + (*b).u=NULL; + (*b).sw=NULL; + } + else + { + (*b).u=amalloc(4*(*b).vol*sizeof(*(*b).u),ALIGN); + (*b).sw=amalloc(2*(*b).vol*sizeof(*(*b).sw),ALIGN); + error(((*b).u==NULL)||((*b).sw==NULL),1,"new_blk [block.c]", + "Unable to allocate the single-precision gauge field"); + set_u2unity(4*(*b).vol,(*b).u); + set_sw2unity(2*(*b).vol,(*b).sw); + } + + if ((shf&0x8)||(iud!=1)) + { + (*b).ud=NULL; + (*b).swd=NULL; + } + else + { + (*b).ud=amalloc(4*(*b).vol*sizeof(*(*b).ud),ALIGN); + (*b).swd=amalloc(2*(*b).vol*sizeof(*(*b).swd),ALIGN); + error(((*b).ud==NULL)||((*b).swd==NULL),1,"new_blk [block.c]", + "Unable to allocate the double-precision gauge field"); + set_ud2unity(4*(*b).vol,(*b).ud); + set_swd2unity(2*(*b).vol,(*b).swd); + } + + if ((shf&0x10)||(ns==0)) + (*b).s=NULL; + else + { + (*b).s=malloc(ns*sizeof(*(*b).s)); + error((*b).s==NULL,1,"new_blk [block.c]", + "Unable to allocate the single-precision spinor fields"); + + (*b).s[0]=amalloc(ns*((*b).vol+1)*sizeof(*((*b).s[0])),ALIGN); + error((*b).s[0]==NULL,2,"new_blk [block.c]", + "Unable to allocate the single-precision spinor fields"); + + for (n=1;n1) + { + for (mu=0;mu<4;mu++) + { + iprms[mu]=bo[mu]; + iprms[4+mu]=bs[mu]; + } + + iprms[8]=iu; + iprms[9]=iud; + iprms[10]=ns; + iprms[11]=nsd; + + MPI_Bcast(iprms,12,MPI_INT,0,MPI_COMM_WORLD); + + ie=0; + + for (mu=0;mu<4;mu++) + if ((iprms[mu]!=bo[mu])||(iprms[4+mu]!=bs[mu])) + ie=1; + + error((ie)||(iprms[8]!=iu)||(iprms[9]!=iud)|| + (iprms[10]!=ns)||(iprms[11]!=nsd),1,"alloc_blk [block.c]", + "Parameters are not global"); + } + + error(iup[0][0]==0,1,"alloc_blk [block.c]", + "The global geometry arrays are not set"); + + new_blk(b,bo,bs,iu,iud,ns,nsd,0x0); + blk_geometry(b); + blk_imbed(b); +} + + +static void new_bnd(block_t *b,int iu,int iud,int nw,int nwd,int shf) +{ + int vol,ifc,n; + int *bs,*ipp,*map,*imb; + su3 *u; + su3_dble *ud; + weyl **w,*wb; + weyl_dble **wd,*wdb; + bndry_t *bb; + + error_root((nw<0)||(nwd<0),1,"new_bnd [block.c]", + "Improper choice of the numbers of Weyl fields"); + + free_bnd(b); + bb=malloc(8*sizeof(*bb)); + error(bb==NULL,1,"new_bnd [block.c]", + "Unable to allocate boundary structures"); + (*b).bb=bb; + + vol=(*b).vol; + bs=(*b).bs; + + for (ifc=0;ifc<8;ifc++) + { + bb[ifc].ifc=ifc; + bb[ifc].vol=vol/bs[ifc/2]; + bb[ifc].nw=nw; + bb[ifc].nwd=nwd; + } + + vol=(*b).vbb; + + if (shf&0x2) + { + for (ifc=0;ifc<8;ifc++) + { + bb[ifc].ipp=NULL; + bb[ifc].map=NULL; + } + } + else + { + ipp=malloc(2*(vol+8)*sizeof(*ipp)); + error(ipp==NULL,1,"new_bnd [block.c]", + "Unable to allocate the geometry arrays"); + map=ipp+vol+8; + + for (ifc=0;ifc<8;ifc++) + { + bb[ifc].ipp=ipp; + ipp+=(bb[ifc].vol+1); + bb[ifc].map=map; + map+=(bb[ifc].vol+1); + } + } + + imb=malloc((vol+8)*sizeof(*imb)); + error(imb==NULL,2,"new_bnd [block.c]", + "Unable to allocate the geometry arrays"); + + for (ifc=0;ifc<8;ifc++) + { + bb[ifc].imb=imb; + imb+=(bb[ifc].vol+1); + } + + if ((shf&0x4)||(iu!=1)) + { + for (ifc=0;ifc<8;ifc++) + bb[ifc].u=NULL; + } + else + { + u=amalloc(vol*sizeof(*u),ALIGN); + error(u==NULL,1,"new_bnd [block.c]", + "Unable to allocate the single-precision gauge field"); + set_u2unity(vol,u); + + for (ifc=0;ifc<8;ifc++) + { + bb[ifc].u=u; + u+=bb[ifc].vol; + } + } + + if ((shf&0x8)||(iud!=1)) + { + for (ifc=0;ifc<8;ifc++) + bb[ifc].ud=NULL; + } + else + { + ud=amalloc(vol*sizeof(*ud),ALIGN); + error(ud==NULL,1,"new_bnd [block.c]", + "Unable to allocate the double-precision gauge field"); + set_ud2unity(vol,ud); + + for (ifc=0;ifc<8;ifc++) + { + bb[ifc].ud=ud; + ud+=bb[ifc].vol; + } + } + + if ((shf&0x40)||(nw==0)) + { + for (ifc=0;ifc<8;ifc++) + bb[ifc].w=NULL; + } + else + { + w=malloc(8*nw*sizeof(*w)); + wb=amalloc(nw*vol*sizeof(*wb),ALIGN); + error((w==NULL)||(wb==NULL),1,"new_bnd [block.c]", + "Unable to allocate the single-precision Weyl fields"); + set_w2zero(nw*vol,wb); + + for (ifc=0;ifc<8;ifc++) + { + bb[ifc].w=w; + + for (n=0;n1) + { + bo=(*b).bo; + bs=(*b).bs; + + for (mu=0;mu<4;mu++) + { + iprms[mu]=bo[mu]; + iprms[4+mu]=bs[mu]; + } + + iprms[8]=iu; + iprms[9]=iud; + iprms[10]=nw; + iprms[11]=nwd; + + MPI_Bcast(iprms,12,MPI_INT,0,MPI_COMM_WORLD); + + ie=0; + + for (mu=0;mu<4;mu++) + if ((iprms[mu]!=bo[mu])||(iprms[4+mu]!=bs[mu])) + ie=1; + + error((ie)||(iprms[8]!=iu)||(iprms[9]!=iud)|| + (iprms[10]!=nw)||(iprms[11]!=nwd),1,"alloc_bnd [block.c]", + "Parameters are not global"); + } + + new_bnd(b,iu,iud,nw,nwd,0x0); + bnd_geometry(b); + bnd_imbed(b); +} + + +void clone_blk(block_t *b,int shf,int *bo,block_t *c) +{ + int iprms[23],mu,ie; + int *bbo,*bs,bshf; + int iu,iud,ns,nsd,iub,iudb,nw,nwd; + int ib,ifc; + + error(fnd_blk(b)==0,1,"clone_blk [block.c]", + "The block to be cloned is not allocated"); + + bbo=(*b).bo; + bs=(*b).bs; + bshf=(*b).shf; + iu=((*b).u!=NULL); + iud=((*b).ud!=NULL); + ns=(*b).ns; + nsd=(*b).nsd; + + if ((*b).bb!=NULL) + { + iub=((*b).bb[0].u!=NULL); + iudb=((*b).bb[0].ud!=NULL); + nw=(*b).bb[0].nw; + nwd=(*b).bb[0].nwd; + ib=1; + } + else + { + iub=0; + iudb=0; + nw=0; + nwd=0; + ib=0; + } + + if (NPROC>1) + { + for (mu=0;mu<4;mu++) + { + iprms[mu]=bbo[mu]; + iprms[4+mu]=bs[mu]; + iprms[8+mu]=bo[mu]; + } + + iprms[12]=bshf; + iprms[13]=iu; + iprms[14]=iud; + iprms[15]=ns; + iprms[16]=nsd; + iprms[17]=iub; + iprms[18]=iudb; + iprms[19]=nw; + iprms[20]=nwd; + iprms[21]=ib; + iprms[22]=shf; + + MPI_Bcast(iprms,23,MPI_INT,0,MPI_COMM_WORLD); + + ie=0; + + for (mu=0;mu<4;mu++) + { + if ((iprms[mu]!=bbo[mu])|| + (iprms[4+mu]!=bs[mu])|| + (iprms[8+mu]!=bo[mu])) + ie=1; + } + + error((ie)||(iprms[12]!=bshf)||(iprms[13]!=iu)||(iprms[14]!=iud)|| + (iprms[15]!=ns)||(iprms[16]!=nsd)||(iprms[17]!=iub)|| + (iprms[18]!=iudb)||(iprms[19]!=nw)||(iprms[20]!=nwd)|| + (iprms[21]!=ib)||(iprms[22]!=shf),1,"clone_blk [block.c]", + "Parameters are not global"); + } + + error_root((bo[0]<0)||((bo[0]+bs[0])>L0)||((abs(bo[0]-bbo[0])%bs[0])!=0)|| + (bo[1]<0)||((bo[1]+bs[1])>L1)||((abs(bo[1]-bbo[1])%bs[1])!=0)|| + (bo[2]<0)||((bo[2]+bs[2])>L2)||((abs(bo[2]-bbo[2])%bs[2])!=0)|| + (bo[3]<0)||((bo[3]+bs[3])>L3)||((abs(bo[3]-bbo[3])%bs[3])!=0),1, + "clone_blk [block.c]","Improper block offset"); + + error_root(((bshf&0x2)&&(!(shf&0x2)))|| + ((bshf&0x4)&&(!(shf&0x4))&&(iu!=0))|| + ((bshf&0x8)&&(!(shf&0x8))&&(iud!=0))|| + ((bshf&0x10)&&(!(shf&0x10))&&(ns>0))|| + ((bshf&0x20)&&(!(shf&0x20))&&(nsd>0)),1, + "clone_blk [block.c]","Share flag mismatch"); + + new_blk(c,bo,bs,iu,iud,ns,nsd,shf); + + if (shf&0x2) + { + (*c).ipt=(*b).ipt; + (*c).iup=(*b).iup; + (*c).idn=(*b).idn; + } + + if ((shf&0x4)&&(iu!=0)) + { + (*c).u=(*b).u; + (*c).sw=(*b).sw; + } + + if ((shf&0x8)&&(iud!=0)) + { + (*c).ud=(*b).ud; + (*c).swd=(*b).swd; + } + + if ((shf&0x10)&&(ns>0)) + (*c).s=(*b).s; + + if ((shf&0x20)&&(nsd>0)) + (*c).sd=(*b).sd; + + if (!(shf&0x2)) + blk_geometry(c); + blk_imbed(c); + + if (ib) + { + error_root(((bshf&0x4)&&(!(shf&0x4))&&(iub!=0))|| + ((bshf&0x8)&&(!(shf&0x8))&&(iudb!=0))|| + ((bshf&0x40)&&(!(shf&0x40))&&(nw>0))|| + ((bshf&0x80)&&(!(shf&0x80))&&(nwd>0)),2, + "clone_blk [block.c]","Share flag mismatch"); + + new_bnd(c,iub,iudb,nw,nwd,shf); + + for (ifc=0;ifc<8;ifc++) + { + if (shf&0x2) + { + (*c).bb[ifc].ipp=(*b).bb[ifc].ipp; + (*c).bb[ifc].map=(*b).bb[ifc].map; + } + + if ((shf&0x4)&&(iub!=0)) + (*c).bb[ifc].u=(*b).bb[ifc].u; + + if ((shf&0x8)&&(iudb!=0)) + (*c).bb[ifc].ud=(*b).bb[ifc].ud; + + if ((shf&0x40)&&(nw>0)) + (*c).bb[ifc].w=(*b).bb[ifc].w; + + if ((shf&0x80)&&(nwd>0)) + (*c).bb[ifc].wd=(*b).bb[ifc].wd; + } + + if (!(shf&0x2)) + bnd_geometry(c); + bnd_imbed(c); + } +} + + +int ipt_blk(block_t *b,int *x) +{ + int *bs,n,ix; + + bs=(*b).bs; + + n=((x[0]<0)||(x[0]>=bs[0])); + ix=x[0]; + + n|=((x[1]<0)||(x[1]>=bs[1])); + ix=x[1]+bs[1]*ix; + + n|=((x[2]<0)||(x[2]>=bs[2])); + ix=x[2]+bs[2]*ix; + + n|=((x[3]<0)||(x[3]>=bs[3])); + ix=x[3]+bs[3]*ix; + + if (n==0) + return (*b).ipt[ix]; + else + { + error_loc(1,1,"ipt_blk [block.c]","Point coordinates are out of range"); + return 0; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/map_s2blk.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/map_s2blk.c new file mode 100644 index 0000000000000000000000000000000000000000..ca421c68574728e30d63d4570f7d8982ba7ad269 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/map_s2blk.c @@ -0,0 +1,890 @@ + +/******************************************************************************* +* +* File map_s2blk.c +* +* Copyright (C) 2005, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Copying of the spinor fields to and from the blocks in a block grid +* +* The externally accessible functions are +* +* void assign_s2sblk(blk_grid_t grid,int n,ptset_t set,spinor *s,int k) +* Assigns the relevant part of the global single-precision spinor field s +* to the single-precision field b.s[k] on the n'th block of the specified +* block grid. Depending on the specified point set, the field on the even, +* odd or all points is copied. +* +* void assign_sblk2s(blk_grid_t grid,int n,ptset_t set,int k,spinor *s) +* Assigns the single-precision spinor field b.s[k] on the n'th block of +* the specified block grid to the relevant part of the global single- +* precision field s. Depending on the specified point set, the field on +* the even, odd or all points is copied. +* +* void assign_s2sdblk(blk_grid_t grid,int n,ptset_t set,spinor *s,int k) +* Assigns the relevant part of the global single-precision spinor field s +* to the double-precision field b.sd[k] on the n'th block of the specified +* block grid. Depending on the specified point set, the field on the even, +* odd or all points is copied. +* +* void assign_sd2sdblk(blk_grid_t grid,int n,ptset_t set, +* spinor_dble *sd,int k) +* Assigns the relevant part of the global double-precision spinor field sd +* to the double-precision field b.sd[k] on the n'th block of the specified +* block grid. Depending on the specified point set, the field on the even, +* odd or all points is copied. +* +* void assign_sdblk2sd(blk_grid_t grid,int n,ptset_t set, +* int k,spinor_dble *sd) +* Assigns the single-precision spinor field b.sd[k] on the n'th block of +* the specified block grid to the relevant part of the global single- +* precision field sd. Depending on the specified point set, the field on +* the even, odd or all points is copied. +* +* Notes: +* +* Only the spinors residing on the blocks (but not those on the boundaries +* of the blocks) are copied. All these programs can be called locally. +* +*******************************************************************************/ + +#define MAP_S2BLK_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "block.h" +#include "global.h" + +#if (defined x64) +#include "sse2.h" + +void assign_s2sblk(blk_grid_t grid,int n,ptset_t set,spinor *s,int k) +{ + int nb,isw,vol,*imb; + spinor *sb,*sm,*rs1,*rs2; + block_t *b; + + b=blk_list(grid,&nb,&isw)+n; + + if ((n<0)||(n>=nb)) + { + error_loc(1,1,"assign_s2sblk [map_s2blk.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(k>=(*b).ns)) + { + error_loc(1,1,"assign_s2sblk [map_s2blk.c]", + "Block field number is out of range"); + return; + } + + vol=(*b).vol; + imb=(*b).imb; + sb=(*b).s[k]; + sm=sb; + + if (set==ALL_PTS) + sm+=vol; + else if (set==EVEN_PTS) + sm+=vol/2; + else if (set==ODD_PTS) + { + imb+=vol/2; + sb+=vol/2; + sm+=vol; + } + + rs2=s+(*imb); + + for (;sb=nb)) + { + error_loc(1,1,"assign_sblk2s [map_s2blk.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(k>=(*b).ns)) + { + error_loc(1,1,"assign_sblk2s [map_s2blk.c]", + "Block field number is out of range"); + return; + } + + vol=(*b).vol; + imb=(*b).imb; + sb=(*b).s[k]; + sm=sb; + + if (set==ALL_PTS) + sm+=vol; + else if (set==EVEN_PTS) + sm+=vol/2; + else if (set==ODD_PTS) + { + imb+=vol/2; + sb+=vol/2; + sm+=vol; + } + + for (;sb=nb)) + { + error_loc(1,1,"assign_s2sdblk [map_s2blk.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(k>=(*b).nsd)) + { + error_loc(1,1,"assign_s2sdblk [map_s2blk.c]", + "Block field number is out of range"); + return; + } + + vol=(*b).vol; + imb=(*b).imb; + sb=(*b).sd[k]; + sm=sb; + + if (set==ALL_PTS) + sm+=vol; + else if (set==EVEN_PTS) + sm+=vol/2; + else if (set==ODD_PTS) + { + imb+=vol/2; + sb+=vol/2; + sm+=vol; + } + + rs2=s+(*imb); + + for (;sb=nb)) + { + error_loc(1,1,"assign_sd2sdblk [map_s2blk.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(k>=(*b).nsd)) + { + error_loc(1,1,"assign_sd2sdblk [map_s2blk.c]", + "Block field number is out of range"); + return; + } + + vol=(*b).vol; + imb=(*b).imb; + sb=(*b).sd[k]; + sm=sb; + + if (set==ALL_PTS) + sm+=vol; + else if (set==EVEN_PTS) + sm+=vol/2; + else if (set==ODD_PTS) + { + imb+=vol/2; + sb+=vol/2; + sm+=vol; + } + + rs2=sd+(*imb); + + for (;sb=nb)) + { + error_loc(1,1,"assign_sdblk2sd [map_s2blk.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(k>=(*b).nsd)) + { + error_loc(1,1,"assign_sdblk2sd [map_s2blk.c]", + "Block field number is out of range"); + return; + } + + vol=(*b).vol; + imb=(*b).imb; + sb=(*b).sd[k]; + sm=sb; + + if (set==ALL_PTS) + sm+=vol; + else if (set==EVEN_PTS) + sm+=vol/2; + else if (set==ODD_PTS) + { + imb+=vol/2; + sb+=vol/2; + sm+=vol; + } + + for (;sb=nb)) + { + error_loc(1,1,"assign_s2sblk [map_s2blk.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(k>=(*b).ns)) + { + error_loc(1,1,"assign_s2sblk [map_s2blk.c]", + "Block field number is out of range"); + return; + } + + vol=(*b).vol; + imb=(*b).imb; + sb=(*b).s[k]; + sm=sb; + + if (set==ALL_PTS) + sm+=vol; + else if (set==EVEN_PTS) + sm+=vol/2; + else if (set==ODD_PTS) + { + imb+=vol/2; + sb+=vol/2; + sm+=vol; + } + + for (;sb=nb)) + { + error_loc(1,1,"assign_sblk2s [map_s2blk.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(k>=(*b).ns)) + { + error_loc(1,1,"assign_sblk2s [map_s2blk.c]", + "Block field number is out of range"); + return; + } + + vol=(*b).vol; + imb=(*b).imb; + sb=(*b).s[k]; + sm=sb; + + if (set==ALL_PTS) + sm+=vol; + else if (set==EVEN_PTS) + sm+=vol/2; + else if (set==ODD_PTS) + { + imb+=vol/2; + sb+=vol/2; + sm+=vol; + } + + for (;sb=nb)) + { + error_loc(1,1,"assign_s2sdblk [map_s2blk.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(k>=(*b).nsd)) + { + error_loc(1,1,"assign_s2sdblk [map_s2blk.c]", + "Block field number is out of range"); + return; + } + + vol=(*b).vol; + imb=(*b).imb; + sb=(*b).sd[k]; + sm=sb; + + if (set==ALL_PTS) + sm+=vol; + else if (set==EVEN_PTS) + sm+=vol/2; + else if (set==ODD_PTS) + { + imb+=vol/2; + sb+=vol/2; + sm+=vol; + } + + for (;sb=nb)) + { + error_loc(1,1,"assign_sd2sdblk [map_s2blk.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(k>=(*b).nsd)) + { + error_loc(1,1,"assign_sd2sdblk [map_s2blk.c]", + "Block field number is out of range"); + return; + } + + vol=(*b).vol; + imb=(*b).imb; + sb=(*b).sd[k]; + sm=sb; + + if (set==ALL_PTS) + sm+=vol; + else if (set==EVEN_PTS) + sm+=vol/2; + else if (set==ODD_PTS) + { + imb+=vol/2; + sb+=vol/2; + sm+=vol; + } + + for (;sb=nb)) + { + error_loc(1,1,"assign_sdblk2sd [map_s2blk.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(k>=(*b).nsd)) + { + error_loc(1,1,"assign_sdblk2sd [map_s2blk.c]", + "Block field number is out of range"); + return; + } + + vol=(*b).vol; + imb=(*b).imb; + sb=(*b).sd[k]; + sm=sb; + + if (set==ALL_PTS) + sm+=vol; + else if (set==EVEN_PTS) + sm+=vol/2; + else if (set==ODD_PTS) + { + imb+=vol/2; + sb+=vol/2; + sm+=vol; + } + + for (;sb +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "sw_term.h" +#include "block.h" +#include "global.h" + +pauli_dble m[2] ALIGNED16; + + +static int cp_swd2sw(block_t *b,ptset_t set) +{ + int *imb,ifail; + pauli *pb,*pm; + pauli_dble *swd,*p; + + swd=swdfld(); + pb=(*b).sw; + pm=pb+(*b).vol; + imb=(*b).imb; + ifail=0; + + for (;pb1) + { + iprms[0]=(int)(grid); + iprms[1]=(int)(set); + + MPI_Bcast(iprms,2,MPI_INT,0,MPI_COMM_WORLD); + + error((iprms[0]!=(int)(grid))||(iprms[1]!=(int)(set)),1, + "assign_swd2swbgr [map_sw2blk.c]","Parameters are not global"); + } + + b=blk_list(grid,&nb,&isw); + + if (nb==0) + { + error_root(1,1,"assign_swd2swbgr [map_sw2blk.c]", + "Block grid is not allocated"); + return 0; + } + + if (((*b).sw==NULL)||((*b).shf&0x4)) + { + error_root(1,1,"assign_swd2swbgr [map_sw2blk.c]", + "SW field on the grid is not allocated or shared"); + return 0; + } + + ie=query_flags(SWD_E_INVERTED); + io=query_flags(SWD_O_INVERTED); + + error_root(((ie)&&((set==ALL_PTS)||(set==EVEN_PTS)))|| + ((io)&&((set==ALL_PTS)||(set==ODD_PTS))),1, + "assign_swd2swbgr [map_sw2blk.c]", + "Attempt to invert the SW field a second time"); + + bm=b+nb; + ifail=0; + + for (;b=nb)) + { + error_loc(1,1,"assign_swd2swdblk [map_sw2blk.c]", + "Block grid is not allocated or block number out of range"); + return 0; + } + + if (((*b).swd==NULL)||(!((*b).shf&0x8))) + { + error_loc(1,1,"assign_swd2swdblk [map_sw2blk.c]", + "Block field is not allocated or not shared"); + return 0; + } + + ie=query_flags(SWD_E_INVERTED); + io=query_flags(SWD_O_INVERTED); + + if (((ie)&&((set==ALL_PTS)||(set==EVEN_PTS)))|| + ((io)&&((set==ALL_PTS)||(set==ODD_PTS)))) + { + error_loc(1,1,"assign_swd2swdblk [map_sw2blk.c]", + "Attempt to invert the SW field a second time"); + return 0; + } + + return cp_swd2swd(b+n,set); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/map_u2blk.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/map_u2blk.c new file mode 100644 index 0000000000000000000000000000000000000000..9fb40cc023f7e42e5c5fd4c860e5e7d41331b161 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/map_u2blk.c @@ -0,0 +1,381 @@ + +/******************************************************************************* +* +* File map_u2blk.c +* +* Copyright (C) 2006, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Copying of the gauge fields to the blocks in a block grid. +* +* The externally accessible functions are +* +* void assign_ud2ubgr(blk_grid_t grid) +* Assigns the global double-precision gauge field to the corresponding +* single-precision fields in the specified block grid (see the notes). +* +* void assign_ud2udblk(blk_grid_t grid,int n) +* Assigns the global double-precision gauge field to the corresponding +* double-precision field on the n'th block of the specified block grid +* (see the notes). +* +* Notes: +* +* The program assign_ud2ubgr() copies the gauge field to all blocks and their +* exterior boundaries (if the field is allocated there). An error occurs if +* the single-precision gauge field on the blocks is shared. On the exterior +* block boundaries at time 0 (boundary conditions type 0,1 and 2) and time +* NPROC0*L0-1 (boundary condition type 0), the link variables are not copied +* and are instead set to zero. +* +* The program assign_ud2udblk() does *not* copy the link variables to the +* boundaries of the block. The double-precision gauge field on the blocks +* must be shared in this case. +* +* As explained in README.block, the field arrays on the blocks reserve space +* for all 8 link variables at the odd points, including those on the links +* that "stick out" of the block. While the latter are used for technical +* purposes only, the programs in this module copy these too. +* +* Both programs in this module may involve communications and must be called +* on all MPI processes simultaneously. +* +*******************************************************************************/ + +#define MAP_U2BLK_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "block.h" +#include "global.h" + +static int bc,np,nmu[8],nbf[8],ofs[8]; +static int sflg[8],rflg[8],tags[8],init=0; +static const su3 u0={{0.0f}}; +static su3 *ubuf; + + +static void alloc_ubuf(void) +{ + int ifc,ib; + + error(iup[0][0]==0,1,"alloc_ubuf [map_u2blk.c]", + "Geometry arrays are not set"); + + bc=bc_type(); + np=(cpr[0]+cpr[1]+cpr[2]+cpr[3])&0x1; + + nbf[0]=FACE0/2; + nbf[1]=FACE0/2; + nbf[2]=FACE1/2; + nbf[3]=FACE1/2; + nbf[4]=FACE2/2; + nbf[5]=FACE2/2; + nbf[6]=FACE3/2; + nbf[7]=FACE3/2; + + ofs[0]=0; + + for (ifc=0;ifc<8;ifc++) + { + nmu[ifc]=cpr[ifc/2]&0x1; + + if (ifc>0) + ofs[ifc]=ofs[ifc-1]+nbf[ifc-1]; + + sflg[ifc]=((ifc>1)|| + ((ifc==0)&&((cpr[0]!=0)||(bc!=0)))|| + ((ifc==1)&&((cpr[0]!=(NPROC0-1))||(bc==3)))); + + rflg[ifc]=((ifc>1)|| + ((ifc==0)&&((cpr[0]!=0)||(bc==3)))|| + ((ifc==1)&&((cpr[0]!=(NPROC0-1))||(bc!=0)))); + + tags[ifc]=mpi_permanent_tag(); + } + + if (BNDRY>0) + { + ubuf=amalloc(BNDRY*sizeof(*ubuf),ALIGN); + error(ubuf==NULL,1,"alloc_ubuf [map_u2blk.c]", + "Unable to allocate communication buffer"); + + for (ib=0;ib0) + { + io=(ifc^nmu[ifc])^0x1; + + sbuf=sbuf0+ofs[io^0x1]; + rbuf=rbuf0+ofs[io]; + saddr=npr[io]; + raddr=saddr; + + n=18*nbf[ifc]; + tag=tags[ifc]; + + if (np==0) + { + if (sflg[io]) + MPI_Send(sbuf,n,MPI_FLOAT,saddr,tag,MPI_COMM_WORLD); + if (rflg[io]) + MPI_Recv(rbuf,n,MPI_FLOAT,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + if (rflg[io]) + MPI_Recv(rbuf,n,MPI_FLOAT,raddr,tag,MPI_COMM_WORLD,&stat); + if (sflg[io]) + MPI_Send(sbuf,n,MPI_FLOAT,saddr,tag,MPI_COMM_WORLD); + } + } + } +} + + +static void assign_ud2ub(block_t *b) +{ + int vol,volb,ifc,ibd,ibu; + int ix,iy,*imb,*ipp,*imbb; + su3 *u,*ub; + su3_dble *udb,*vd; + bndry_t *bb; + + vol=(*b).vol; + imb=(*b).imb; + + udb=udfld(); + u=(*b).u; + + for (ix=(vol/2);ix1)||((ifc==0)&&(ibd==0))||((ifc==1)&&(ibu==0))) + { + ipp=(*bb).ipp; + + for (ix=0;ix<(volb/2);ix++) + { + iy=ipp[ix]; + (*u)=ub[8*(iy-(vol/2))+(ifc^0x1)]; + u+=1; + } + + imbb=(*bb).imb; + + for (;ix1) + { + iprms[0]=(int)(grid); + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + + error(iprms[0]!=(int)(grid),1,"assign_u2ubgr [map_u2blk.c]", + "Parameter is not global"); + } + + if (init==0) + alloc_ubuf(); + + b=blk_list(grid,&nb,&isw); + + error((b==NULL)||((*b).u==NULL)||((*b).shf&0x4),1, + "assign_u2ubgr [map_u2blk.c]","Unallocated or improper block grid"); + + if (NPROC>1) + { + fetch_bnd_u(); + send_bnd_u(); + } + + bm=b+nb; + + for (;b=nb)) + { + error_loc(1,1,"assign_ud2udblk [map_u2blk.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + b+=n; + + if (((*b).ud==NULL)||(((*b).shf&0x8)==0)) + { + error_loc(1,1,"assign_ud2udblk [map_u2blk.c]", + "Block field is not allocated or not shared"); + return; + } + + vol=(*b).vol; + imb=(*b).imb; + ud=(*b).ud; + udb=udfld(); + + for (ix=(vol/2);ix=0, the deflation + subspace is regenerated by calling dfl_modes(). The solver program + dfl_sap_gcr() is then called again and the results are passed to + the calling program. + +void dfl_sd2vd(spinor_dble *sd,complex_dble *vd) + Assigns the components of the global double-precision spinor field + sd along the deflation subspace to the double-precision vector + field vd. + +void dfl_vd2sd(complex_dble *vd,spinor_dble *sd) + Assigns the element of the deflation subspace corresponding to the + double-precision vector field vd to the global double-precision spinor + field sd. + +void dfl_sub_vd2sd(complex_dble *vd,spinor_dble *sd) + Subtracts the element of the deflation subspace corresponding to the + double-precision vector field vd from the global double-precision + spinor field sd. + +void dfl_s2v(spinor *s,complex *v) + Assigns the components of the global single-precision spinor field + s along the deflation subspace to the single-precision vector + field v. + +void dfl_v2s(complex *v,spinor *s) + Assigns the element of the deflation subspace corresponding to the + single-precision vector field v to the global single-precision spinor + field s. + +void dfl_sub_v2s(complex *v,spinor *s) + Subtracts the element of the deflation subspace corresponding to the + double-precision vector field v from the global single-precision spinor + field s. + +void dfl_subspace(spinor **mds) + Copies the global single-precision spinor fields mds[0],..,mds[Ns-1] + to the fields b.sd[1],..,b.sd[Ns] on the blocks b of the DFL_BLOCKS + grid. The block fields are then orthonormalized and are assigned to + the single-precision block fields b.s[1],..,b.s[Ns]. + In this basis of fields, the modes mds[0],..,mds[Ns-1] are given by + fields vmds[0],..,vmds[Ns-1] of Ns*nb complex numbers, where nb is + the number of blocks in the block grid. These fields are assigned to + the last Ns single-precision vector fields of the array returned by + vflds() [vflds/vflds.c]. + +double ltl_gcr(int nkv,int nmx,double res,double mu, + complex_dble *eta,complex_dble *psi,int *status) + Obtains an approximate solution psi of the little Dirac equation for + given source eta using the even-odd preconditioned GCR algorithm. See + the notes for the explanation of the parameters of the program. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/dfl/dfl_geometry.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/dfl/dfl_geometry.c new file mode 100644 index 0000000000000000000000000000000000000000..74b9a05f6f4d95e21eb3a6434928ac280ac3c53d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/dfl/dfl_geometry.c @@ -0,0 +1,406 @@ + +/******************************************************************************* +* +* File dfl_geometry.c +* +* Copyright (C) 2007, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Geometry of the DFL_BLOCKS block grid. +* +* The externally accessible functions are +* +* dfl_grid_t dfl_geometry(void) +* Returns a structure containing the index arrays that describe the +* geometry of the DFL_BLOCKS block grid (see the notes). +* +* Notes: +* +* The blocks in the DFL_BLOCKS grid form a hypercubic lattice whose geometry +* is described by a structure of type dfl_grid_t. The elements of this +* structure are: +* +* nb Number of blocks in the local lattice. +* +* nbb Number of exterior boundary blocks of the local +* block lattice. +* +* inn[ix][ifc] Index of the nearest neighbour block in direction ifc +* of the block with index ix (ix=0,..,nb-1, ifc=0,..,7). +* The ordering of the directions ifc is -0,+0,..,-3,+3. +* +* idx[ix] Position of the block with index ix in the array of +* blocks returned by blk_list(). Note that ix=idx[ib] +* if ib=idx[ix]. +* +* ipp[ix] Index of the nearest neighbour (partner block) in the +* local lattice of the block on the exterior boundary +* with index nb+ix (ix=0,..,nbb-1). +* +* map[ix] Index of the partner block on the opposite face of the +* local lattice of the block on the exterior boundary +* with index nb+ix (ix=0,..,nbb-1). +* +* nbbe[ifc] Number of even (odd) blocks on the exterior boundary +* nbbo[ifc] in direction ifc. +* +* obbe[ifc] Offset of the index of the first even (odd) block on +* obbo[ifc] the exterior boundary in direction ifc. The offsets +* are given relative to the first block on the boundary. +* +* The blocks in the local lattice are ordered according to their Cartesian +* coordinates (n0,n1,n2,n3) in the total block lattice. First come all even +* blocks (those with (n0+n1+n2+n3)=0 mod 2) and then the odd ones. Within +* each of these two groups of blocks, the ordering is lexicographic, i.e. +* the block with coordinates n comes before the block with coordinates m if +* +* (n0 +#include +#include +#include "flags.h" +#include "utils.h" +#include "dfl.h" +#include "global.h" + +static int isw,init=0; +static int nbl[4],nbb[4]; +static dfl_grid_t dfl_grid; + + +static void set_grid_sizes(void) +{ + int mu,ifc; + int *bs,*nbbe,*nbbo,*obbe,*obbo; + dfl_parms_t dfl; + + dfl=dfl_parms(); + bs=dfl.bs; + + error_root(dfl.Ns==0,1,"set_grid_sizes [dfl_geometry.c]", + "Deflation subspace parameters are not set"); + + nbl[0]=L0/bs[0]; + nbl[1]=L1/bs[1]; + nbl[2]=L2/bs[2]; + nbl[3]=L3/bs[3]; + + nbb[0]=(NPROC0>1)*nbl[1]*nbl[2]*nbl[3]; + nbb[1]=(NPROC1>1)*nbl[2]*nbl[3]*nbl[0]; + nbb[2]=(NPROC2>1)*nbl[3]*nbl[0]*nbl[1]; + nbb[3]=(NPROC3>1)*nbl[0]*nbl[1]*nbl[2]; + + isw=(nbl[0]*cpr[0]+nbl[1]*cpr[1]+ + nbl[2]*cpr[2]+nbl[3]*cpr[3])&0x1; + + dfl_grid.nb=nbl[0]*nbl[1]*nbl[2]*nbl[3]; + dfl_grid.nbb=2*(nbb[0]+nbb[1]+nbb[2]+nbb[3]); + + nbbe=dfl_grid.nbbe; + nbbo=dfl_grid.nbbo; + obbe=dfl_grid.obbe; + obbo=dfl_grid.obbo; + + for (mu=0;mu<4;mu++) + { + if (isw) + { + nbbe[2*mu]=(nbb[mu]+1)/2; + nbbo[2*mu]=nbb[mu]-nbbe[2*mu]; + } + else + { + nbbo[2*mu]=(nbb[mu]+1)/2; + nbbe[2*mu]=nbb[mu]-nbbo[2*mu]; + } + + if (nbl[mu]&0x1) + { + nbbe[2*mu+1]=nbbe[2*mu]; + nbbo[2*mu+1]=nbbo[2*mu]; + } + else + { + nbbe[2*mu+1]=nbbo[2*mu]; + nbbo[2*mu+1]=nbbe[2*mu]; + } + } + + obbe[0]=0; + + for (ifc=1;ifc<8;ifc++) + obbe[ifc]=obbe[ifc-1]+nbbe[ifc-1]; + + obbo[0]=obbe[7]+nbbe[7]; + + for (ifc=1;ifc<8;ifc++) + obbo[ifc]=obbo[ifc-1]+nbbo[ifc-1]; +} + + +static void alloc_arrays(void) +{ + int nb,nbb; + int (*inn)[8],*idx; + + nb=dfl_grid.nb; + nbb=dfl_grid.nbb; + inn=malloc(nb*sizeof(*inn)); + idx=malloc((nb+2*nbb)*sizeof(*idx)); + + error((inn==NULL)||(idx==NULL),1,"alloc_arrays [dfl_geometry.c]", + "Unable to allocate index arrays"); + + dfl_grid.inn=inn; + dfl_grid.idx=idx; + idx+=nb; + dfl_grid.ipp=idx; + idx+=nbb; + dfl_grid.map=idx; +} + + +static void set_index(void) +{ + int n0,n1,n2,n3; + int in,ic[2],*idx; + + in=0; + ic[0]=0; + ic[1]=dfl_grid.nb/2; + idx=dfl_grid.idx; + + for (n0=0;n01) + { + if (n0==0) + inn[in][0]=nb; + if (n0==(nbl[0]-1)) + inn[in][1]=nb; + } + if (NPROC1>1) + { + if (n1==0) + inn[in][2]=nb; + if (n1==(nbl[1]-1)) + inn[in][3]=nb; + } + if (NPROC2>1) + { + if (n2==0) + inn[in][4]=nb; + if (n2==(nbl[2]-1)) + inn[in][5]=nb; + } + if (NPROC3>1) + { + if (n3==0) + inn[in][6]=nb; + if (n3==(nbl[3]-1)) + inn[in][7]=nb; + } + } + } + } + } + + obbe=dfl_grid.obbe; + obbo=dfl_grid.obbo; + + for (ifc=0;ifc<8;ifc++) + ic[ifc]=0; + + for (in=0;in=nb) + { + ipp[im-nb]=in; + + ip=in; + iq=in; + + while (ip=4). +* +* nmr Number of block minimal residual iterations to be +* used when the SAP smoother is applied. +* +* ncy Number of SAP cycles per inverse iteration. +* +* All these are set by set_dfl_gen_parms(). Additionally, the values of +* parameters +* +* nkv Maximal number of Krylov vectors to be used by the +* solver for the little Dirac equation before a restart. +* +* nmx Maximal total number of Krylov vectors generated by +* the solver for the little Dirac equation. +* +* res Required relative residue when solving the little +* Dirac equation. +* +* are set by set_dfl_pro_parms(). +* +* On exit the argument status[0] reports the average solver iteration numbers +* that were required for the solution of the little Dirac equation. A negative +* value indicates that the program failed (-1: the solver did not converge, -2: +* the inversion of the SW term was not safe, -3: the inversion of the diagonal +* part of the little Dirac operator was not safe). In all these cases, the +* deflation subspace is initialized with the fields that were computed before +* the failure occured. +* +* The programs dfl_modes2() and dfl_update2() can be used in place of the +* programs dfl_modes() and dfl_update(), respectively, if some protection +* against the rare cases, where the little Dirac operator turns out to be +* accidentally ill-conditioned, is desired. +* +* The programs in this module perform global operations and must be called +* simultaneously on all MPI processes. The required workspaces are +* +* spinor Ns+2 (Ns: number of deflation modes per block) +* complex 2*nkv+2 +* complex_dble 4 +* +* (see utils/wspace.c) +* +* Some debugging output is printed to stdout on process 0 if DFL_MODES_DBG is +* defined at compilation time. +* +*******************************************************************************/ + +#define DFL_MODES_C + +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "random.h" +#include "lattice.h" +#include "block.h" +#include "uflds.h" +#include "sflds.h" +#include "vflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "sap.h" +#include "little.h" +#include "dfl.h" +#include "global.h" + +typedef union +{ + spinor s; + float r[24]; +} spin_t; + +static int my_rank,eoflg; +static int Ns=0,nv,nrn; +static double m0; +static complex_dble *cs1,*cs2; +static dfl_pro_parms_t dpr; +static dfl_gen_parms_t dgn; + +#ifdef DFL_MODES_DBG + +static void print_res(spinor **mds) +{ + int k; + double r; + spinor **ws; + + ws=reserve_ws(1); + + for (k=0;k1) + { + MPI_Reduce(cs1,cs2,2*n,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(cs2,2*n,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + { + for (k=0;k0) + { + for (l=0;l %.1e, ratio = %.1e\n",k,r0,r1,r1/r0); + } + + release_ws(); +} + + +static void dfl_smooth_fields(spinor **mds,int *status) +{ + int k,l,stat; + double r0,r1; + complex **vs,**wv; + complex_dble **wvd; + spinor **ws; + + vs=vflds()+Ns; + wv=reserve_wv(1); + wvd=reserve_wvd(1); + ws=reserve_ws(2); + r0=1.0; + r1=1.0; + + for (k=0;k %.1e, ratio = %.1e\n", + k,stat,r0,r1,r1/r0); + + mulr_spinor_add(VOLUME,mds[k],ws[1],1.0f); + + if (status[0]>=0) + { + if (stat>=0) + status[0]+=stat; + else + status[0]=stat; + } + } + + release_ws(); + release_wvd(); + release_wv(); +} + +#else + +static void smooth_fields(int ncy,spinor **mds) +{ + int k,l; + spinor **ws; + + ws=reserve_ws(1); + + for (k=0;k=0) + { + if (stat>=0) + status[0]+=stat; + else + status[0]=stat; + } + } + + release_ws(); + release_wvd(); + release_wv(); +} + +#endif + +void dfl_modes(int *status) +{ + int n,ifail; + spinor **mds; + + status[0]=0; + ifail=set_frame(); + mds=reserve_ws(Ns); + random_fields(mds); + +#ifdef DFL_MODES_DBG + if (my_rank==0) + { + printf("Progress report [program dfl_modes]:\n\n"); + printf("Ns = %d, ninv = %d, nmr = %d, ncy = %d\n", + Ns,dgn.ninv,dgn.nmr,dgn.ncy); + printf("nkv = %d, nmx = %d, res = %.1e, ifail = %d\n\n", + dpr.nkv,dpr.nmx,dpr.res,ifail); + } +#endif + + if (ifail) + { + dfl_subspace(mds); + status[0]=-2; + } + else + { + for (n=0;n<3;n++) + { + smooth_fields(n+1,mds); + +#ifdef DFL_MODES_DBG + print_res(mds); +#endif + } + + for (;n3) + renormalize_fields(mds); + + dfl_subspace(mds); + ifail=set_Awhat(dgn.mu); + + if (ifail) + { + status[0]=-3; + break; + } + else + { + dfl_smooth_fields(mds,status); + nrn+=1; + + if (status[0]<0) + break; + +#ifdef DFL_MODES_DBG + print_res(mds); +#endif + } + } + + if (status[0]>=0) + { + dfl_subspace(mds); + n=Ns*(dgn.ninv-3); + status[0]=(status[0]+n/2)/n; + } + } + + release_ws(); + set_sw_parms(m0); + if (eoflg!=1) + set_tm_parms(eoflg); + +#ifdef DFL_MODES_DBG + if (my_rank==0) + { + printf("status = %d\n",status[0]); + printf("dfl_modes: all done\n\n"); + fflush(stdout); + } +#endif +} + + +void dfl_update(int nsm,int *status) +{ + int n,ifail,iprms[1]; + spinor **mds; + + if (NPROC>1) + { + iprms[0]=nsm; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + + error(iprms[0]!=nsm,1,"dfl_update [dfl_modes.c]", + "Parameters are not global"); + } + + status[0]=0; + ifail=set_frame(); + mds=reserve_ws(Ns); + restore_fields(mds); + +#ifdef DFL_MODES_DBG + if (my_rank==0) + { + printf("Progress report [program dfl_update]:\n\n"); + printf("nsm = %d\n",nsm); + printf("Ns = %d, ninv = %d, nmr = %d, ncy = %d\n", + Ns,dgn.ninv,dgn.nmr,dgn.ncy); + printf("nkv = %d, nmx = %d, res = %.1e, ifail = %d\n\n", + dpr.nkv,dpr.nmx,dpr.res,ifail); + } +#endif + + if (ifail) + status[0]=-2; + else + { + for (n=0;n3)&&(n<(nsm-1))) + renormalize_fields(mds); + + dfl_subspace(mds); + +#ifdef DFL_MODES_DBG + print_res(mds); +#endif + } + } + } + + if (status[0]>0) + { + n=Ns*nsm; + status[0]=(status[0]+n/2)/n; + } + + release_ws(); + set_sw_parms(m0); + if (eoflg!=1) + set_tm_parms(eoflg); + +#ifdef DFL_MODES_DBG + if (my_rank==0) + { + printf("status = %d\n",status[0]); + printf("dfl_update: all done\n\n"); + fflush(stdout); + } +#endif +} + + +void dfl_modes2(int *status) +{ + dfl_modes(status); + + if (status[0]==-3) + { +#ifdef DFL_MODES_DBG + if (my_rank==0) + { + printf("Generation of deflation subspace failed\n"); + printf("Start second attempt\n"); + fflush(stdout); + } +#endif + + dfl_modes(status+1); + } + else + status[1]=0; +} + + +void dfl_update2(int nsm,int *status) +{ + dfl_update(nsm,status); + + if (status[0]==-3) + { +#ifdef DFL_MODES_DBG + if (my_rank==0) + { + printf("Update of deflation subspace failed\n"); + printf("Attempt to regenerate subspace\n"); + fflush(stdout); + } +#endif + + dfl_modes(status+1); + } + else + status[1]=0; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/dfl/dfl_sap_gcr.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/dfl/dfl_sap_gcr.c new file mode 100644 index 0000000000000000000000000000000000000000..9c066e95dceca9f2928c8ddbc3189ccb37fac5fe --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/dfl/dfl_sap_gcr.c @@ -0,0 +1,381 @@ + +/******************************************************************************* +* +* File dfl_sap_gcr.c +* +* Copyright (C) 2007, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* SAP+GCR solver for the Wilson-Dirac equation with local deflation. +* +* The externally accessible functions are +* +* double dfl_sap_gcr(int nkv,int nmx,double res,double mu, +* spinor_dble *eta,spinor_dble *psi,int *status) +* Obtains an approximate solution psi of the Wilson-Dirac equation for +* given source eta using the deflated SAP-preconditioned GCR algorithm. +* See the notes for the explanation of the parameters of the program. +* +* double dfl_sap_gcr2(int nkv,int nmx,double res,double mu, +* spinor_dble *eta,spinor_dble *psi,int *status) +* This program calls dfl_sap_gcr() with the parameters nkv,..,status. +* If the solver fails and status[0]=-3 or status[1]<0, the deflation +* subspace is regenerated by calling dfl_modes(). The solver program +* dfl_sap_gcr() is then called again and the results are passed to +* the calling program. +* +* Depending on whether the twisted-mass flag is set or not, the programs +* solve the equation +* +* (Dw+i*mu*gamma_5*1e)*psi=eta or (Dw+i*mu*gamma_5)*psi=eta +* +* respectively. The twisted-mass flag is retrieved from the parameter data +* base (see flags/lat_parms.c). + +* The program dfl_sap_gcr() is based on the flexible GCR algorithm (see +* linsolv/fgcr.c). Before the solver is launched, the following parameter- +* setting programs must have been called: +* +* set_lat_parms() SW improvement coefficient. +* +* set_bc_parms() Boundary conditions and associated improvement +* coefficients. +* +* set_sw_parms() Bare quark mass. +* +* set_sap_parms() Parameters of the SAP preconditioner. +* +* set_dfl_parms() Parameters of the deflation subspace. +* +* set_dfl_pro_parms() Parameters used for the deflation projection. +* +* See doc/parms.pdf and the relevant files in the modules/flags directory +* for further explanations. The deflation subspace must have been properly +* initialized by the program dfl_subspace(). +* +* All other parameters are passed through the argument list: +* +* nkv Maximal number of Krylov vectors generated before the GCR +* algorithm is restarted. +* +* nmx Maximal total number of Krylov vectors that may be generated. +* +* res Desired maximal relative residue |eta-D*psi|/|eta| of the +* calculated solution. +* +* mu Value of the twisted mass in the Dirac equation. +* +* eta Source field. Note that source fields must vanish at global +* time 0 and NPR0C0*L0-1, as has to be the case for physical +* quark fields. eta is unchanged on exit unless psi=eta (which +* is permissible). +* +* psi Calculated approximate solution of the Dirac equation. psi +* vanishes at global time 0 and NPROC0*L0-1. +* +* The argument status must point to an array of at least 2 and 3 integers +* in the case of the programs dfl_sap_gcr() and dfl_sap_gcr2(). On exit, +* the array elements contain the following values: +* +* status[0] If the program is able to solve the Dirac equation to the +* desired accuracy, status[0] reports the total number of Krylov +* vectors that were required for the solution. Negative values +* indicate that the program failed (-1: the algorithm did not +* converge, -2: the inversion of the SW term on the odd points +* was not safe, -3: the inversion of the diagonal parts of the +* little Dirac operator was not safe). +* +* status[1] Average number of GCR iterations needed for the solution of +* the little Dirac equation in the course of the deflation +* projection. +* +* The program dfl_sap_gcr2() in addition returns +* +* status[2] Average solver iteration numbers that were required for the +* solution of the little Dirac equation when the deflation sub- +* space had to be regenerated (if the regeneration fails, the +* dfl_sap_gcr2() program terminates with an error message). +* +* If status[0]>=-1 and status[1]>=0, the programs return the norm of the +* residue of the calculated approximate solution. Otherwise the field psi +* is set to zero and the program returns the norm of the source eta. +* +* The SAP_BLOCKS blocks grid is automatically allocated or reallocated if +* it is not already allocated with the correct block size. The SW term is +* recalculated when needed and the gauge and SW fields are copied to the +* SAP block grid if they are not in the proper condition. Similarly, the +* little Dirac operator is updated when needed. +* +* The program dfl_sap_gcr2() can be used in place of dfl_sap_gcr() if +* some protection against the rare cases, where the little Dirac operator +* turns out to be accidentally ill-conditioned, is desired. +* +* Evidently the SAP+GCR solver is a global program that must be called on +* all processes simultaneously. The required workspaces are +* +* spinor 2*nkv+2 +* spinor_dble 3 [2 in the case of dfl_sap_gcr()] +* complex 2*nkv_pro+2 +* complex_dble 4 +* +* (see utils/wspace.c), where nkv_pro, the maximal number of Krylov vectors +* generated before the GCR solver of the little Dirac equation is restarted, +* is a parameter set by dfl_pro_parms(). +* +*******************************************************************************/ + +#define DFL_SAP_GCR_C + +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "block.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "linsolv.h" +#include "sap.h" +#include "vflds.h" +#include "little.h" +#include "dfl.h" +#include "global.h" + +static int nit,stat,nv; +static float mus; +static double mud; +static sap_parms_t spr; +static dfl_pro_parms_t dpr; + + +static void Dop(spinor_dble *s,spinor_dble *r) +{ + Dw_dble(mud,s,r); +} + + +static void Mop(int k,spinor *rho,spinor *phi,spinor *chi) +{ + int n,status; + complex **wv; + complex_dble **wvd; + spinor **ws; + + wv=reserve_wv(1); + wvd=reserve_wvd(1); + ws=reserve_ws(1); + + dfl_s2v(rho,wv[0]); + assign_v2vd(nv,wv[0],wvd[0]); + ltl_gcr(dpr.nkv,dpr.nmx,dpr.res,mud,wvd[0],wvd[0],&status); + assign_vd2v(nv,wvd[0],wv[0]); + dfl_v2s(wv[0],ws[0]); + + Dw(mus,ws[0],chi); + diff_s2s(VOLUME,rho,chi); + set_s2zero(VOLUME,phi); + + for (n=0;n=0) + { + if (status>=0) + { + nit+=1; + stat+=status; + } + else + stat=status; + } + + release_ws(); + release_wvd(); + release_wv(); +} + + +double dfl_sap_gcr(int nkv,int nmx,double res,double mu, + spinor_dble *eta,spinor_dble *psi,int *status) +{ + int *bs,nb,isw,ifail; + int swde,swdo,swu,swe,swo; + double rho,rho0,fact; + spinor **ws; + spinor_dble **wsd,**rsd; + dfl_parms_t dfl; + + dfl=dfl_parms(); + error_root(dfl.Ns==0,1,"dfl_sap_gcr [dfl_sap_gcr.c]", + "Deflation parameters are not set"); + bs=dfl.bs; + nv=dfl.Ns*VOLUME/(bs[0]*bs[1]*bs[2]*bs[3]); + + spr=sap_parms(); + error_root(spr.ncy==0,1,"dfl_sap_gcr [dfl_sap_gcr.c]", + "SAP parameters are not set"); + + dpr=dfl_pro_parms(); + error_root(dpr.nkv==0,1,"dfl_sap_gcr [dfl_sap_gcr.c]", + "Deflation projector parameters are not set"); + + blk_list(SAP_BLOCKS,&nb,&isw); + + if (nb==0) + alloc_bgr(SAP_BLOCKS); + + if (query_grid_flags(SAP_BLOCKS,UBGR_MATCH_UD)!=1) + assign_ud2ubgr(SAP_BLOCKS); + + if (query_flags(SWD_UP2DATE)!=1) + sw_term(NO_PTS); + + swde=query_flags(SWD_E_INVERTED); + swdo=query_flags(SWD_O_INVERTED); + + swu=query_grid_flags(SAP_BLOCKS,SW_UP2DATE); + swe=query_grid_flags(SAP_BLOCKS,SW_E_INVERTED); + swo=query_grid_flags(SAP_BLOCKS,SW_O_INVERTED); + ifail=0; + + if (spr.isolv==0) + { + if ((swde==1)||(swdo==1)) + sw_term(NO_PTS); + + if ((swu!=1)||(swe==1)||(swo==1)) + assign_swd2swbgr(SAP_BLOCKS,NO_PTS); + } + else if (spr.isolv==1) + { + if ((swde!=1)&&(swdo==1)) + { + if ((swu!=1)||(swe==1)||(swo!=1)) + assign_swd2swbgr(SAP_BLOCKS,NO_PTS); + + sw_term(NO_PTS); + } + else + { + if ((swde==1)||(swdo==1)) + sw_term(NO_PTS); + + if ((swu!=1)||(swe==1)||(swo!=1)) + ifail=assign_swd2swbgr(SAP_BLOCKS,ODD_PTS); + } + } + else + error_root(1,1,"dfl_sap_gcr [dfl_sap_gcr.c]","Unknown block solver"); + + if (query_flags(U_MATCH_UD)!=1) + assign_ud2u(); + + if ((query_flags(SW_UP2DATE)!=1)|| + (query_flags(SW_E_INVERTED)==1)||(query_flags(SW_O_INVERTED)==1)) + assign_swd2sw(); + + rho0=sqrt(norm_square_dble(VOLUME,1,eta)); + rho=rho0; + status[0]=0; + status[1]=0; + + if (ifail) + status[0]=-2; + else + { + ifail=set_Awhat(mu); + + if (ifail) + status[0]=-3; + else + { + ws=reserve_ws(2*nkv+1); + wsd=reserve_wsd(1); + rsd=reserve_wsd(1); + + nit=0; + stat=0; + mus=(float)(mu); + mud=mu; + + fact=rho0/sqrt((double)(VOLUME)*(double)(24*NPROC)); + + if (fact!=0.0) + { + assign_sd2sd(VOLUME,eta,rsd[0]); + scale_dble(VOLUME,1.0/fact,rsd[0]); + + rho=fgcr(VOLUME,1,Dop,Mop,ws,wsd,nkv,nmx,res,rsd[0],psi,status); + + scale_dble(VOLUME,fact,psi); + rho*=fact; + + if ((nit>0)&&(stat>=0)) + status[1]=(stat+nit/2)/nit; + else if (stat<0) + status[1]=stat; + } + else + { + rho=0.0; + set_sd2zero(VOLUME,psi); + } + + release_wsd(); + release_wsd(); + release_ws(); + } + } + + if ((status[0]<-1)||(status[1]<0)) + { + rho=rho0; + set_sd2zero(VOLUME,psi); + } + + return rho; +} + + +double dfl_sap_gcr2(int nkv,int nmx,double res,double mu, + spinor_dble *eta,spinor_dble *psi,int *status) +{ + double rho; + spinor_dble **wsd; + + wsd=reserve_wsd(1); + + if (eta==psi) + { + assign_sd2sd(VOLUME,eta,wsd[0]); + eta=wsd[0]; + } + + rho=dfl_sap_gcr(nkv,nmx,res,mu,eta,psi,status); + + if ((status[0]==-3)||(status[1]<0)) + { + dfl_modes(status+2); + + error_root(status[2]<0,1,"dfl_sap_gcr2 [dfl_sap_gcr.c]", + "Deflation subspace regeneration failed (status = %d)", + status[2]); + + rho=dfl_sap_gcr(nkv,nmx,res,mu,eta,psi,status); + } + else + status[2]=0; + + release_wsd(); + + return rho; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/dfl/dfl_subspace.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/dfl/dfl_subspace.c new file mode 100644 index 0000000000000000000000000000000000000000..c196014100807d65a118e40b6aaeadd2f5270008 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/dfl/dfl_subspace.c @@ -0,0 +1,378 @@ + +/******************************************************************************* +* +* File dfl_subspace.c +* +* Copyright (C) 2007, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Basic utility programs related to the deflation subspace. +* +* The externally accessible functions are +* +* void dfl_sd2vd(spinor_dble *sd,complex_dble *vd) +* Assigns the components of the global double-precision spinor field +* sd along the deflation subspace to the double-precision vector +* field vd. +* +* void dfl_vd2sd(complex_dble *vd,spinor_dble *sd) +* Assigns the element of the deflation subspace corresponding to the +* double-precision vector field vd to the global double-precision spinor +* field sd. +* +* void dfl_sub_vd2sd(complex_dble *vd,spinor_dble *sd) +* Subtracts the element of the deflation subspace corresponding to the +* double-precision vector field vd from the global double-precision +* spinor field sd. +* +* void dfl_s2v(spinor *s,complex *v) +* Assigns the components of the global single-precision spinor field +* s along the deflation subspace to the single-precision vector +* field v. +* +* void dfl_v2s(complex *v,spinor *s) +* Assigns the element of the deflation subspace corresponding to the +* single-precision vector field v to the global single-precision spinor +* field s. +* +* void dfl_sub_v2s(complex *v,spinor *s) +* Subtracts the element of the deflation subspace corresponding to the +* double-precision vector field v from the global single-precision spinor +* field s. +* +* void dfl_subspace(spinor **mds) +* Copies the global single-precision spinor fields mds[0],..,mds[Ns-1] +* to the fields b.sd[1],..,b.sd[Ns] on the blocks b of the DFL_BLOCKS +* grid. The block fields are then orthonormalized and are assigned to +* the single-precision block fields b.s[1],..,b.s[Ns]. +* In this basis of fields, the modes mds[0],..,mds[Ns-1] are given by +* fields vmds[0],..,vmds[Ns-1] of Ns*nb complex numbers, where nb is +* the number of blocks in the block grid. These fields are assigned to +* the last Ns single-precision vector fields of the array returned by +* vflds() [vflds/vflds.c]. +* +* Notes: +* +* The deflation subspace is spanned by the fields (*b).sd[1],..,(*b).sd[Ns] +* on the blocks b of the DFL_BLOCKS grid. The number Ns of fields is set by +* the program dfl_set_parms() [flags/dfl_parms.c]. +* +* Any spinor field in the deflation subspace is a linear combination of the +* basis elements on the blocks. The associated complex coefficients form a +* vector field of the type described in vflds/vflds.c. Such fields are thus +* in one-to-one correspondence with the deflation modes. In particular, the +* deflation subspace contains the global spinor fields from which it was +* created by the program dfl_subspace(). +* +* The program dfl_subspace() allocates the DFL_BLOCKS block grid if it is +* not already allocated. This program involves global operations and must be +* called simultaneously on all processes. +* +*******************************************************************************/ + +#define DFL_SUBSPACE_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "linalg.h" +#include "sflds.h" +#include "block.h" +#include "vflds.h" +#include "dfl.h" +#include "global.h" + + +void dfl_sd2vd(spinor_dble *sd,complex_dble *vd) +{ + int Ns,nb,nbh,isw; + int n,m,i,vol; + block_t *b; + spinor_dble **sdb; + dfl_parms_t dfl; + + dfl=dfl_parms(); + Ns=dfl.Ns; + b=blk_list(DFL_BLOCKS,&nb,&isw); + nbh=nb/2; + vol=(*b).vol; + + for (n=0;n=-1, the program returns the norm of the residue of the +* calculated approximate solution of the even-odd preconditioned, globally +* deflated little Dirac equation. No action is performed if status=-2 +* and the program returns 1.0. +* +* The even-odd preconditioned little Dirac operator is updated if it is +* not up-to-date. Evidently the solver is a global program that must be +* called on all processes simultaneously. The required workspaces are +* +* complex 2*nkv+1 +* complex_dble 3 +* +* (see utils/wspace.c). +* +*******************************************************************************/ + +#define LTL_GCR_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "flags.h" +#include "vflds.h" +#include "linalg.h" +#include "linsolv.h" +#include "little.h" +#include "dfl.h" +#include "global.h" + +static int Ns=0,nv,nvh; +static double rvol; +static complex **vs; +static complex_dble **vds,*awd,*cs1,*cs2; + + +static void set_constants(void) +{ + dfl_parms_t dfl; + dfl_grid_t grd; + + dfl=dfl_parms(); + grd=dfl_geometry(); + + Ns=dfl.Ns; + nv=Ns*grd.nb; + nvh=nv/2; + rvol=1.0/sqrt((double)(nv)*(double)(NPROC)); + + vs=vflds(); + vds=vdflds(); + awd=ltl_matrix(); + + cs1=amalloc(2*Ns*sizeof(*cs1),ALIGN); + error(cs1==NULL,1,"set_constants [ltl_gcr.c]", + "Unable to allocate auxiliary arrays"); + cs2=cs1+Ns; +} + + +static void sum_vprod(int n,complex_dble *z,complex_dble *w) +{ + int k; + + if (NPROC>1) + { + MPI_Reduce(z,w,2*n,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(w,2*n,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + { + for (k=0;k +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "flags.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "sw_term.h" +#include "block.h" +#include "dirac.h" +#include "global.h" + +#define N0 (NPROC0*L0) + +typedef union +{ + spinor s; + weyl w[2]; +} spin_t; + +static float coe,ceo; +static const spinor s0={{{0.0}}}; +static spin_t rs ALIGNED32; + +#if (defined AVX) +#include "avx.h" + +#define _load_cst(c) \ +__asm__ __volatile__ ("vbroadcastss %0, %%ymm15 \n\t" \ + : \ + : \ + "m" (c) \ + : \ + "xmm15") + +#define _mul_cst() \ +__asm__ __volatile__ ("vmulps %%ymm15, %%ymm0, %%ymm0 \n\t" \ + "vmulps %%ymm15, %%ymm1, %%ymm1 \n\t" \ + "vmulps %%ymm15, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + + +static void doe(int *piup,int *pidn,su3 *u,spinor *pk) +{ + spinor *sp,*sm; + +/******************************** direction 0 *********************************/ + + sp=pk+piup[0]; + sm=pk+pidn[0]; + + _avx_spinor_pair_load34(*sp,*sm); + + sp=pk+piup[1]; + sm=pk+pidn[1]; + _prefetch_spinor(sp); + _prefetch_spinor(sm); + + _avx_spinor_mul_up(_avx_sgn_add); + _avx_spinor_add(); + + _avx_su3_pair_mixed_multiply(u[0],u[1]); + + _avx_spinor_split(); + _avx_spinor_unsplit(); + _avx_spinor_store_up(rs.s); + +/******************************** direction 1 *********************************/ + + _avx_spinor_pair_load43(*sp,*sm); + + sp=pk+piup[2]; + sm=pk+pidn[2]; + _prefetch_spinor(sp); + _prefetch_spinor(sm); + + _avx_spinor_imul_up(_avx_sgn_i_add); + _avx_spinor_add(); + + _avx_su3_pair_mixed_multiply(u[2],u[3]); + + _avx_spinor_split(); + _avx_spinor_load(rs.s); + _avx_weyl_xch_imul(_sse_sgn24); + _avx_spinor_unsplit(); + _avx_spinor_add(); + _avx_spinor_store(rs.s); + +/******************************** direction 2 *********************************/ + + _avx_spinor_pair_load43(*sp,*sm); + + sp=pk+piup[3]; + sm=pk+pidn[3]; + _prefetch_spinor(sp); + _prefetch_spinor(sm); + + _avx_spinor_mul_up(_avx_sgn_addsub); + _avx_spinor_add(); + + _avx_su3_pair_mixed_multiply(u[4],u[5]); + + _avx_spinor_split(); + _avx_spinor_load(rs.s); + _avx_weyl_xch(); + _avx_weyl_mul(_sse_sgn12); + _avx_spinor_unsplit(); + _avx_spinor_add(); + _avx_spinor_store(rs.s); + +/******************************** direction 3 *********************************/ + + _avx_spinor_pair_load34(*sp,*sm); + _avx_spinor_imul_up(_avx_sgn_i_addsub); + _avx_spinor_add(); + + _avx_su3_pair_mixed_multiply(u[6],u[7]); + + _avx_spinor_split(); + _avx_spinor_load(rs.s); + _avx_weyl_imul(_sse_sgn23); + _avx_spinor_unsplit(); + _load_cst(coe); + _avx_spinor_add(); + _mul_cst(); + _avx_weyl_pair_store12(rs.w[0],rs.w[1]); + + _avx_zeroupper(); +} + + +static void deo(int *piup,int *pidn,su3 *u,spinor *pl) +{ + spinor *sp,*sm; + + _load_cst(ceo); + _avx_spinor_load(rs.s); + _mul_cst(); + _avx_spinor_store(rs.s); + +/******************************** direction 0 *********************************/ + + sm=pl+pidn[0]; + sp=pl+piup[0]; + + _prefetch_spinor(sm); + _prefetch_spinor(sp); + + _avx_spinor_load_dup(rs.s); + _avx_spinor_mul_up(_avx_sgn_add); + _avx_spinor_add(); + + _avx_su3_pair_mixed_multiply(u[1],u[0]); + + _avx_weyl_pair_load12(*sm,*sp); + _avx_spinor_add(); + _avx_weyl_pair_store12(*sm,*sp); + + _avx_weyl_pair_load34(*sm,*sp); + _avx_spinor_mul_up(_avx_sgn_add); + _avx_spinor_add(); + _avx_weyl_pair_store34(*sm,*sp); + +/******************************** direction 1 *********************************/ + + sm=pl+pidn[1]; + sp=pl+piup[1]; + + _prefetch_spinor(sm); + _prefetch_spinor(sp); + + _avx_spinor_load_dup(rs.s); + _avx_spinor_xch_imul_up(_avx_sgn_i_add); + _avx_spinor_add(); + + _avx_su3_pair_mixed_multiply(u[3],u[2]); + + _avx_weyl_pair_load12(*sm,*sp); + _avx_spinor_add(); + _avx_weyl_pair_store12(*sm,*sp); + + _avx_weyl_pair_load34(*sm,*sp); + _avx_spinor_xch_imul_up(_avx_sgn_i_add); + _avx_spinor_sub(); + _avx_weyl_pair_store34(*sm,*sp); + +/******************************** direction 2 *********************************/ + + sm=pl+pidn[2]; + sp=pl+piup[2]; + + _prefetch_spinor(sm); + _prefetch_spinor(sp); + + _avx_spinor_load_dup(rs.s); + _avx_spinor_xch_up(); + _avx_spinor_mul_up(_avx_sgn_addsub); + _avx_spinor_add(); + + _avx_su3_pair_mixed_multiply(u[5],u[4]); + + _avx_weyl_pair_load12(*sm,*sp); + _avx_spinor_add(); + _avx_weyl_pair_store12(*sm,*sp); + + _avx_weyl_pair_load34(*sm,*sp); + _avx_spinor_xch_up(); + _avx_spinor_mul_up(_avx_sgn_addsub); + _avx_spinor_sub(); + _avx_weyl_pair_store34(*sm,*sp); + +/******************************** direction 3 *********************************/ + + sm=pl+pidn[3]; + sp=pl+piup[3]; + + _prefetch_spinor(sm); + _prefetch_spinor(sp); + + _avx_spinor_load_dup(rs.s); + _avx_spinor_imul_up(_avx_sgn_i_addsub); + _avx_spinor_add(); + + _avx_su3_pair_mixed_multiply(u[7],u[6]); + + _avx_weyl_pair_load12(*sm,*sp); + _avx_spinor_add(); + _avx_weyl_pair_store12(*sm,*sp); + + _avx_weyl_pair_load34(*sm,*sp); + _avx_spinor_imul_up(_avx_sgn_i_addsub); + _avx_spinor_sub(); + _avx_weyl_pair_store34(*sm,*sp); + + _avx_zeroupper(); +} + +#elif (defined x64) +#include "sse2.h" + +#define _load_cst(c) \ +__asm__ __volatile__ ("movss %0, %%xmm15 \n\t" \ + "shufps $0x0, %%xmm15, %%xmm15" \ + : \ + : \ + "m" (c) \ + : \ + "xmm15") + +#define _mul_cst() \ +__asm__ __volatile__ ("mulps %%xmm15, %%xmm0 \n\t" \ + "mulps %%xmm15, %%xmm1 \n\t" \ + "mulps %%xmm15, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +#define _mul_cst_up() \ +__asm__ __volatile__ ("mulps %%xmm15, %%xmm3 \n\t" \ + "mulps %%xmm15, %%xmm4 \n\t" \ + "mulps %%xmm15, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + + +static void doe(int *piup,int *pidn,su3 *u,spinor *pk) +{ + spinor *sp,*sm; + +/******************************* direction +0 *********************************/ + + sp=pk+(*(piup++)); + + _sse_pair_load((*sp).c1,(*sp).c2); + _sse_pair_load_up((*sp).c3,(*sp).c4); + + sm=pk+(*(pidn++)); + _prefetch_spinor(sm); + + _sse_vector_add(); + sp=pk+(*(piup++)); + _prefetch_spinor(sp); + _sse_su3_multiply(*u); + + _sse_weyl_store_up(rs.w[0]); + _sse_weyl_store_up(rs.w[1]); + +/******************************* direction -0 *********************************/ + + _sse_pair_load((*sm).c1,(*sm).c2); + _sse_pair_load_up((*sm).c3,(*sm).c4); + + u+=2; + _prefetch_su3_dble(u); + u-=1; + _sse_vector_sub(); + sm=pk+(*(pidn++)); + _prefetch_spinor(sm); + _sse_su3_inverse_multiply(*u); + + _sse_weyl_load(rs.w[0]); + _sse_vector_add(); + _sse_weyl_store(rs.w[0]); + + _sse_weyl_load(rs.w[1]); + _sse_vector_sub(); + _sse_weyl_store(rs.w[1]); + +/******************************* direction +1 *********************************/ + + _sse_pair_load((*sp).c1,(*sp).c2); + _sse_pair_load_up((*sp).c4,(*sp).c3); + + _sse_vector_i_add(); + sp=pk+(*(piup++)); + _prefetch_spinor(sp); + u+=1; + _sse_su3_multiply(*u); + + _sse_weyl_load(rs.w[0]); + _sse_vector_add(); + _sse_weyl_store(rs.w[0]); + + _sse_weyl_load(rs.w[1]); + _sse_vector_xch_i_sub(); + _sse_weyl_store(rs.w[1]); + +/******************************* direction -1 *********************************/ + + _sse_pair_load((*sm).c1,(*sm).c2); + _sse_pair_load_up((*sm).c4,(*sm).c3); + + u+=2; + _prefetch_su3_dble(u); + u-=1; + _sse_vector_i_sub(); + sm=pk+(*(pidn++)); + _prefetch_spinor(sm); + _sse_su3_inverse_multiply(*u); + + _sse_weyl_load(rs.w[0]); + _sse_vector_add(); + _sse_weyl_store(rs.w[0]); + + _sse_weyl_load(rs.w[1]); + _sse_vector_xch_i_add(); + _sse_weyl_store(rs.w[1]); + +/******************************* direction +2 *********************************/ + + _sse_pair_load((*sp).c1,(*sp).c2); + _sse_pair_load_up((*sp).c4,(*sp).c3); + + _sse_vector_addsub(); + + u+=1; + _sse_su3_multiply(*u); + sp=pk+(*(piup)); + _prefetch_spinor(sp); + _sse_weyl_load(rs.w[0]); + _sse_vector_add(); + _sse_weyl_store(rs.w[0]); + + _sse_weyl_load(rs.w[1]); + _sse_vector_xch(); + _sse_vector_subadd(); + _sse_weyl_store(rs.w[1]); + +/******************************* direction -2 *********************************/ + + _sse_pair_load((*sm).c1,(*sm).c2); + _sse_pair_load_up((*sm).c4,(*sm).c3); + + u+=2; + _prefetch_su3_dble(u); + u-=1; + _sse_vector_subadd(); + sm=pk+(*(pidn)); + _prefetch_spinor(sm); + _sse_su3_inverse_multiply(*u); + + _sse_weyl_load(rs.w[0]); + _sse_vector_add(); + _sse_weyl_store(rs.w[0]); + + _sse_weyl_load(rs.w[1]); + _sse_vector_xch(); + _sse_vector_addsub(); + _sse_weyl_store(rs.w[1]); + +/******************************* direction +3 *********************************/ + + _sse_pair_load((*sp).c1,(*sp).c2); + _sse_pair_load_up((*sp).c3,(*sp).c4); + + _sse_vector_i_addsub(); + u+=1; + _sse_su3_multiply(*u); + + _sse_weyl_load(rs.w[0]); + _sse_vector_add(); + _sse_weyl_store(rs.w[0]); + + _sse_weyl_load(rs.w[1]); + _sse_vector_i_subadd(); + _sse_weyl_store(rs.w[1]); + +/******************************* direction -3 *********************************/ + + _sse_pair_load((*sm).c1,(*sm).c2); + _sse_pair_load_up((*sm).c3,(*sm).c4); + + u+=2; + _prefetch_su3_dble(u); + u-=1; + _sse_vector_i_subadd(); + _sse_su3_inverse_multiply(*u); + + _load_cst(coe); + _sse_weyl_load(rs.w[0]); + _sse_vector_add(); + _mul_cst(); + _sse_pair_store(rs.s.c1,rs.s.c2); + + _sse_weyl_load(rs.w[1]); + _sse_vector_i_addsub(); + _mul_cst(); + _sse_pair_store(rs.s.c3,rs.s.c4); +} + + +static void deo(int *piup,int *pidn,su3 *u,spinor *pl) +{ + spinor *sp,*sm; + +/******************************* direction +0 *********************************/ + + sp=pl+(*(piup++)); + _prefetch_spinor(sp); + + _load_cst(ceo); + _sse_pair_load(rs.s.c1,rs.s.c2); + _sse_pair_load_up(rs.s.c3,rs.s.c4); + _mul_cst(); + _mul_cst_up(); + _sse_weyl_store(rs.w[0]); + _sse_weyl_store_up(rs.w[1]); + + sm=pl+(*(pidn++)); + _prefetch_spinor(sm); + _sse_vector_sub(); + _sse_su3_inverse_multiply(*u); + + _sse_pair_load((*sp).c1,(*sp).c2); + _sse_vector_add(); + _sse_pair_store((*sp).c1,(*sp).c2); + + _sse_pair_load((*sp).c3,(*sp).c4); + _sse_vector_sub(); + _sse_pair_store((*sp).c3,(*sp).c4); + +/******************************* direction -0 *********************************/ + + _sse_weyl_load(rs.w[0]); + _sse_weyl_load_up(rs.w[1]); + + sp=pl+(*(piup++)); + _prefetch_spinor(sp); + _sse_vector_add(); + u+=1; + _sse_su3_multiply(*u); + + _sse_pair_load((*sm).c1,(*sm).c2); + _sse_vector_add(); + _sse_pair_store((*sm).c1,(*sm).c2); + + _sse_pair_load((*sm).c3,(*sm).c4); + _sse_vector_add(); + _sse_pair_store((*sm).c3,(*sm).c4); + +/******************************* direction +1 *********************************/ + + _sse_weyl_load(rs.w[0]); + _sse_weyl_load_up(rs.w[1]); + + sm=pl+(*(pidn++)); + _prefetch_spinor(sm); + _sse_vector_xch_i_sub(); + u+=1; + _sse_su3_inverse_multiply(*u); + + _sse_pair_load((*sp).c1,(*sp).c2); + _sse_vector_add(); + _sse_pair_store((*sp).c1,(*sp).c2); + + _sse_pair_load((*sp).c3,(*sp).c4); + _sse_vector_xch_i_add(); + _sse_pair_store((*sp).c3,(*sp).c4); + +/******************************* direction -1 *********************************/ + + _sse_weyl_load(rs.w[0]); + _sse_weyl_load_up(rs.w[1]); + + sp=pl+(*(piup++)); + _prefetch_spinor(sp); + _sse_vector_xch_i_add(); + u+=1; + _sse_su3_multiply(*u); + + _sse_pair_load((*sm).c1,(*sm).c2); + _sse_vector_add(); + _sse_pair_store((*sm).c1,(*sm).c2); + + _sse_pair_load((*sm).c3,(*sm).c4); + _sse_vector_xch_i_sub(); + _sse_pair_store((*sm).c3,(*sm).c4); + +/******************************* direction +2 *********************************/ + + _sse_weyl_load(rs.w[0]); + _sse_weyl_load_up(rs.w[1]); + + sm=pl+(*(pidn++)); + _prefetch_spinor(sm); + _sse_vector_xch(); + _sse_vector_subadd(); + u+=1; + _sse_su3_inverse_multiply(*u); + + _sse_pair_load((*sp).c1,(*sp).c2); + _sse_vector_add(); + _sse_pair_store((*sp).c1,(*sp).c2); + + _sse_pair_load((*sp).c3,(*sp).c4); + _sse_vector_xch(); + _sse_vector_addsub(); + _sse_pair_store((*sp).c3,(*sp).c4); + +/******************************* direction -2 *********************************/ + + _sse_weyl_load(rs.w[0]); + _sse_weyl_load_up(rs.w[1]); + + sp=pl+(*(piup)); + _prefetch_spinor(sp); + _sse_vector_xch(); + _sse_vector_addsub(); + u+=1; + _sse_su3_multiply(*u); + + _sse_pair_load((*sm).c1,(*sm).c2); + _sse_vector_add(); + _sse_pair_store((*sm).c1,(*sm).c2); + + _sse_pair_load((*sm).c3,(*sm).c4); + _sse_vector_xch(); + _sse_vector_subadd(); + _sse_pair_store((*sm).c3,(*sm).c4); + +/******************************* direction +3 *********************************/ + + _sse_weyl_load(rs.w[0]); + _sse_weyl_load_up(rs.w[1]); + + sm=pl+(*(pidn)); + _prefetch_spinor(sm); + _sse_vector_i_subadd(); + u+=1; + _sse_su3_inverse_multiply(*u); + + _sse_pair_load((*sp).c1,(*sp).c2); + _sse_vector_add(); + _sse_pair_store((*sp).c1,(*sp).c2); + + _sse_pair_load((*sp).c3,(*sp).c4); + _sse_vector_i_addsub(); + _sse_pair_store((*sp).c3,(*sp).c4); + +/******************************* direction -3 *********************************/ + + _sse_weyl_load(rs.w[0]); + _sse_weyl_load_up(rs.w[1]); + + _sse_vector_i_addsub(); + u+=1; + _sse_su3_multiply(*u); + + _sse_pair_load((*sm).c1,(*sm).c2); + _sse_vector_add(); + _sse_pair_store((*sm).c1,(*sm).c2); + + _sse_pair_load((*sm).c3,(*sm).c4); + _sse_vector_i_subadd(); + _sse_pair_store((*sm).c3,(*sm).c4); +} + +#else + +#define _vector_mul_assign(r,c) \ + (r).c1.re*=(c); \ + (r).c1.im*=(c); \ + (r).c2.re*=(c); \ + (r).c2.im*=(c); \ + (r).c3.re*=(c); \ + (r).c3.im*=(c) + + +static void doe(int *piup,int *pidn,su3 *u,spinor *pk) +{ + spinor *sp,*sm; + su3_vector psi,chi; + +/******************************* direction +0 *********************************/ + + sp=pk+(*(piup++)); + + _vector_add(psi,(*sp).c1,(*sp).c3); + _su3_multiply(rs.s.c1,*u,psi); + rs.s.c3=rs.s.c1; + + _vector_add(psi,(*sp).c2,(*sp).c4); + _su3_multiply(rs.s.c2,*u,psi); + rs.s.c4=rs.s.c2; + +/******************************* direction -0 *********************************/ + + sm=pk+(*(pidn++)); + u+=1; + + _vector_sub(psi,(*sm).c1,(*sm).c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_sub_assign(rs.s.c3,chi); + + _vector_sub(psi,(*sm).c2,(*sm).c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_sub_assign(rs.s.c4,chi); + +/******************************* direction +1 *********************************/ + + sp=pk+(*(piup++)); + u+=1; + + _vector_i_add(psi,(*sp).c1,(*sp).c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_i_sub_assign(rs.s.c4,chi); + + _vector_i_add(psi,(*sp).c2,(*sp).c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_i_sub_assign(rs.s.c3,chi); + +/******************************* direction -1 *********************************/ + + sm=pk+(*(pidn++)); + u+=1; + + _vector_i_sub(psi,(*sm).c1,(*sm).c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_i_add_assign(rs.s.c4,chi); + + _vector_i_sub(psi,(*sm).c2,(*sm).c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_i_add_assign(rs.s.c3,chi); + +/******************************* direction +2 *********************************/ + + sp=pk+(*(piup++)); + u+=1; + + _vector_add(psi,(*sp).c1,(*sp).c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_add_assign(rs.s.c4,chi); + + _vector_sub(psi,(*sp).c2,(*sp).c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_sub_assign(rs.s.c3,chi); + +/******************************* direction -2 *********************************/ + + sm=pk+(*(pidn++)); + u+=1; + + _vector_sub(psi,(*sm).c1,(*sm).c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_sub_assign(rs.s.c4,chi); + + _vector_add(psi,(*sm).c2,(*sm).c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_add_assign(rs.s.c3,chi); + +/******************************* direction +3 *********************************/ + + sp=pk+(*(piup)); + u+=1; + + _vector_i_add(psi,(*sp).c1,(*sp).c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_i_sub_assign(rs.s.c3,chi); + + _vector_i_sub(psi,(*sp).c2,(*sp).c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_i_add_assign(rs.s.c4,chi); + +/******************************* direction -3 *********************************/ + + sm=pk+(*(pidn)); + u+=1; + + _vector_i_sub(psi,(*sm).c1,(*sm).c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_i_add_assign(rs.s.c3,chi); + + _vector_i_add(psi,(*sm).c2,(*sm).c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_i_sub_assign(rs.s.c4,chi); + + _vector_mul_assign(rs.s.c1,coe); + _vector_mul_assign(rs.s.c2,coe); + _vector_mul_assign(rs.s.c3,coe); + _vector_mul_assign(rs.s.c4,coe); +} + + +static void deo(int *piup,int *pidn,su3 *u,spinor *pl) +{ + spinor *sp,*sm; + su3_vector psi,chi; + + _vector_mul_assign(rs.s.c1,ceo); + _vector_mul_assign(rs.s.c2,ceo); + _vector_mul_assign(rs.s.c3,ceo); + _vector_mul_assign(rs.s.c4,ceo); + +/******************************* direction +0 *********************************/ + + sp=pl+(*(piup++)); + + _vector_sub(psi,rs.s.c1,rs.s.c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c1,chi); + _vector_sub_assign((*sp).c3,chi); + + _vector_sub(psi,rs.s.c2,rs.s.c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c2,chi); + _vector_sub_assign((*sp).c4,chi); + +/******************************* direction -0 *********************************/ + + sm=pl+(*(pidn++)); + u+=1; + + _vector_add(psi,rs.s.c1,rs.s.c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c1,chi); + _vector_add_assign((*sm).c3,chi); + + _vector_add(psi,rs.s.c2,rs.s.c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c2,chi); + _vector_add_assign((*sm).c4,chi); + +/******************************* direction +1 *********************************/ + + sp=pl+(*(piup++)); + u+=1; + + _vector_i_sub(psi,rs.s.c1,rs.s.c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c1,chi); + _vector_i_add_assign((*sp).c4,chi); + + _vector_i_sub(psi,rs.s.c2,rs.s.c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c2,chi); + _vector_i_add_assign((*sp).c3,chi); + +/******************************* direction -1 *********************************/ + + sm=pl+(*(pidn++)); + u+=1; + + _vector_i_add(psi,rs.s.c1,rs.s.c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c1,chi); + _vector_i_sub_assign((*sm).c4,chi); + + _vector_i_add(psi,rs.s.c2,rs.s.c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c2,chi); + _vector_i_sub_assign((*sm).c3,chi); + +/******************************* direction +2 *********************************/ + + sp=pl+(*(piup++)); + u+=1; + + _vector_sub(psi,rs.s.c1,rs.s.c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c1,chi); + _vector_sub_assign((*sp).c4,chi); + + _vector_add(psi,rs.s.c2,rs.s.c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c2,chi); + _vector_add_assign((*sp).c3,chi); + +/******************************* direction -2 *********************************/ + + sm=pl+(*(pidn++)); + u+=1; + + _vector_add(psi,rs.s.c1,rs.s.c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c1,chi); + _vector_add_assign((*sm).c4,chi); + + _vector_sub(psi,rs.s.c2,rs.s.c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c2,chi); + _vector_sub_assign((*sm).c3,chi); + +/******************************* direction +3 *********************************/ + + sp=pl+(*(piup)); + u+=1; + + _vector_i_sub(psi,rs.s.c1,rs.s.c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c1,chi); + _vector_i_add_assign((*sp).c3,chi); + + _vector_i_add(psi,rs.s.c2,rs.s.c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c2,chi); + _vector_i_sub_assign((*sp).c4,chi); + +/******************************* direction -3 *********************************/ + + sm=pl+(*(pidn)); + u+=1; + + _vector_i_add(psi,rs.s.c1,rs.s.c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c1,chi); + _vector_i_sub_assign((*sm).c3,chi); + + _vector_i_sub(psi,rs.s.c2,rs.s.c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c2,chi); + _vector_i_add_assign((*sm).c4,chi); +} + +#endif + +void Dw(float mu,spinor *s,spinor *r) +{ + int bc,ix,t; + int *piup,*pidn; + su3 *u,*um; + pauli *m; + spin_t *so,*ro; + tm_parms_t tm; + + cps_int_bnd(0x1,s); + m=swfld(); + apply_sw(VOLUME/2,mu,m,s,r); + set_s2zero(BNDRY/2,r+VOLUME); + tm=tm_parms(); + if (tm.eoflg==1) + mu=0.0f; + + coe=-0.5f; + ceo=-0.5f; + bc=bc_type(); + piup=iup[VOLUME/2]; + pidn=idn[VOLUME/2]; + + so=(spin_t*)(s+(VOLUME/2)); + ro=(spin_t*)(r+(VOLUME/2)); + m+=VOLUME; + u=ufld(); + um=u+4*VOLUME; + + if (((cpr[0]==0)&&(bc!=3))||((cpr[0]==(NPROC0-1))&&(bc==0))) + { + ix=VOLUME/2; + + for (;u0)&&((t<(N0-1))||(bc!=0))) + { + doe(piup,pidn,u,s); + + mul_pauli2(mu,m,&((*so).s),&((*ro).s)); + + _vector_add_assign((*ro).s.c1,rs.s.c1); + _vector_add_assign((*ro).s.c2,rs.s.c2); + _vector_add_assign((*ro).s.c3,rs.s.c3); + _vector_add_assign((*ro).s.c4,rs.s.c4); + rs=(*so); + + deo(piup,pidn,u,r); + } + else + { + (*so).s=s0; + (*ro).s=s0; + } + + piup+=4; + pidn+=4; + so+=1; + ro+=1; + m+=2; + } + } + else + { + for (;u0)&&((t<(N0-1))||(bc!=0))) + mul_pauli2(mu,m,&((*se).s),&((*re).s)); + else + { + (*se).s=s0; + (*re).s=s0; + } + + se+=1; + re+=1; + } + } + else + { + for (;m0)&&((t<(N0-1))||(bc!=0))) + mul_pauli2(mu,m,&((*so).s),&((*ro).s)); + else + { + (*so).s=s0; + (*ro).s=s0; + } + + so+=1; + ro+=1; + } + } + else + { + for (;m0)&&((t<(N0-1))||(bc!=0))) + { + doe(piup,pidn,u,s); + (*ro)=rs; + } + else + (*ro).s=s0; + + piup+=4; + pidn+=4; + ro+=1; + } + } + else + { + for (;u0)&&((t<(N0-1))||(bc!=0))) + { + rs=(*so); + deo(piup,pidn,u,r); + } + else + (*so).s=s0; + + piup+=4; + pidn+=4; + so+=1; + } + } + else + { + for (;u0)&&((t<(N0-1))||(bc!=0))) + { + doe(piup,pidn,u,s); + + mul_pauli2(0.0f,m,&(rs.s),&(rs.s)); + + deo(piup,pidn,u,r); + } + + piup+=4; + pidn+=4; + m+=2; + } + } + else + { + for (;u=nb)) + { + error_loc(1,1,"Dw_blk [Dw.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k==l)||(k>=(*b).ns)||(l>=(*b).ns)||((*b).u==NULL)) + { + error_loc(1,1,"Dw_blk [Dw.c]", + "Attempt to access unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + volh=vol/2; + s=(*b).s[k]; + r=(*b).s[l]; + so=(spin_t*)(s+volh); + ro=(spin_t*)(r+volh); + + s[vol]=s0; + r[vol]=s0; + m=(*b).sw; + apply_sw(volh,mu,m,s,r); + tm=tm_parms(); + if (tm.eoflg==1) + mu=0.0f; + + coe=-0.5f; + ceo=-0.5f; + piup=(*b).iup[volh]; + pidn=(*b).idn[volh]; + m+=vol; + u=(*b).u; + um=u+4*vol; + + if ((*b).nbp) + { + ibp=(*b).ibp; + ibm=ibp+(*b).nbp/2; + + for (;ibp=nb)) + { + error_loc(1,1,"Dwee_blk [Dw.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k>=(*b).ns)||(l>=(*b).ns)||((*b).u==NULL)) + { + error_loc(1,1,"Dwee_blk [Dw.c]", + "Attempt to access unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + se=(spin_t*)((*b).s[k]); + re=(spin_t*)((*b).s[l]); + m=(*b).sw; + mm=m+vol; + + if ((*b).nbp) + { + piup=(*b).iup[0]; + pidn=(*b).idn[0]; + + ibu=((cpr[0]==(NPROC0-1))&&(((*b).bo[0]+(*b).bs[0])==L0)&&(bc_type()==0)); + ibd=((cpr[0]==0)&&((*b).bo[0]==0)); + + for (;m=nb)) + { + error_loc(1,1,"Dwoo_blk [Dw.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k>=(*b).ns)||(l>=(*b).ns)||((*b).u==NULL)) + { + error_loc(1,1,"Dwoo_blk [Dw.c]", + "Attempt to access unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + volh=vol/2; + so=(spin_t*)((*b).s[k]+volh); + ro=(spin_t*)((*b).s[l]+volh); + tm=tm_parms(); + if (tm.eoflg==1) + mu=0.0f; + + m=(*b).sw+vol; + mm=m+vol; + + if ((*b).nbp) + { + piup=(*b).iup[volh]; + pidn=(*b).idn[volh]; + + ibu=((cpr[0]==(NPROC0-1))&&(((*b).bo[0]+(*b).bs[0])==L0)&&(bc_type()==0)); + ibd=((cpr[0]==0)&&((*b).bo[0]==0)); + + for (;m=nb)) + { + error_loc(1,1,"Dwoe_blk [Dw.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k>=(*b).ns)||(l>=(*b).ns)||((*b).u==NULL)) + { + error_loc(1,1,"Dwoe_blk [Dw.c]", + "Attempt to access unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + volh=vol/2; + s=(*b).s[k]; + ro=(spin_t*)((*b).s[l]+volh); + s[vol]=s0; + + coe=-0.5f; + piup=(*b).iup[volh]; + pidn=(*b).idn[volh]; + u=(*b).u; + um=u+4*vol; + + if ((*b).nbp) + { + ibp=(*b).ibp; + ibm=ibp+(*b).nbp/2; + + for (;ibp=nb)) + { + error_loc(1,1,"Dweo_blk [Dw.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k>=(*b).ns)||(l>=(*b).ns)||((*b).u==NULL)) + { + error_loc(1,1,"Dweo_blk [Dw.c]", + "Attempt to access unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + volh=vol/2; + so=(spin_t*)((*b).s[k]+volh); + r=(*b).s[l]; + r[vol]=s0; + + ceo=0.5f; + piup=(*b).iup[volh]; + pidn=(*b).idn[volh]; + u=(*b).u; + um=u+4*vol; + + if ((*b).nbp) + { + ibu=((cpr[0]==(NPROC0-1))&&(((*b).bo[0]+(*b).bs[0])==L0)&&(bc_type()==0)); + ibd=((cpr[0]==0)&&((*b).bo[0]==0)); + + for (;u=nb)) + { + error_loc(1,1,"Dwhat_blk [Dw.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k==l)||(k>=(*b).ns)||(l>=(*b).ns)||((*b).u==NULL)) + { + error_loc(1,1,"Dweo_blk [Dw.c]", + "Attempt to access unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + volh=vol/2; + s=(*b).s[k]; + r=(*b).s[l]; + + s[vol]=s0; + r[vol]=s0; + m=(*b).sw; + apply_sw(volh,mu,m,s,r); + + coe=-0.5f; + ceo=0.5f; + piup=(*b).iup[volh]; + pidn=(*b).idn[volh]; + m+=vol; + u=(*b).u; + um=u+4*vol; + + if ((*b).nbp) + { + ibp=(*b).ibp; + ibm=ibp+(*b).nbp/2; + + for (;ibp +#include +#include +#include +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "block.h" +#include "dirac.h" +#include "global.h" + +#if (defined AVX) +#include "avx.h" + +#define _load_cst(c) \ +__asm__ __volatile__ ("vbroadcastss %0, %%ymm15 \n\t" \ + : \ + : \ + "m" (c) \ + : \ + "xmm15") + +#define _mul_cst() \ +__asm__ __volatile__ ("vmulps %%ymm15, %%ymm0, %%ymm0 \n\t" \ + "vmulps %%ymm15, %%ymm1, %%ymm1 \n\t" \ + "vmulps %%ymm15, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +#define _load_zero() \ +__asm__ __volatile__ ("vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" \ + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" \ + "vxorps %%ymm2, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +#define _set_s2zero(s) \ +__asm__ __volatile__ ("vmovaps %%ymm0, %0" \ + : \ + "=m" ((*s).c1.c1), \ + "=m" ((*s).c1.c2), \ + "=m" ((*s).c1.c3), \ + "=m" ((*s).c2.c1)); \ +__asm__ __volatile__ ("vmovaps %%ymm1, %0" \ + : \ + "=m" ((*s).c2.c2), \ + "=m" ((*s).c2.c3), \ + "=m" ((*s).c3.c1), \ + "=m" ((*s).c3.c2)); \ +__asm__ __volatile__ ("vmovaps %%ymm2, %0" \ + : \ + "=m" ((*s).c3.c3), \ + "=m" ((*s).c4.c1), \ + "=m" ((*s).c4.c2), \ + "=m" ((*s).c4.c3)) + +#define _set_w2zero(w) \ +__asm__ __volatile__ ("vmovaps %%ymm0, %0" \ + : \ + "=m" ((w[0]).c1.c1), \ + "=m" ((w[0]).c1.c2), \ + "=m" ((w[0]).c1.c3), \ + "=m" ((w[0]).c2.c1)); \ +__asm__ __volatile__ ("vmovaps %%ymm1, %0" \ + : \ + "=m" ((w[0]).c2.c2), \ + "=m" ((w[0]).c2.c3), \ + "=m" ((w[1]).c1.c1), \ + "=m" ((w[1]).c1.c2)); \ +__asm__ __volatile__ ("vmovaps %%ymm2, %0" \ + : \ + "=m" ((w[1]).c1.c3), \ + "=m" ((w[1]).c2.c1), \ + "=m" ((w[1]).c2.c2), \ + "=m" ((w[1]).c2.c3)) + +#define _weyl_pair_store(rl,rh) \ +__asm__ __volatile__ ("vshufps $0x44, %%ymm4, %%ymm3, %%ymm6 \n\t" \ + "vshufps $0xe4, %%ymm3, %%ymm5, %%ymm7 \n\t" \ + "vshufps $0xee, %%ymm5, %%ymm4, %%ymm8" \ + : \ + : \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vmovaps %%xmm6, %0 \n\t" \ + "vmovaps %%xmm7, %2 \n\t" \ + "vmovaps %%xmm8, %4" \ + : \ + "=m" ((rl).c1.c1), \ + "=m" ((rl).c1.c2), \ + "=m" ((rl).c1.c3), \ + "=m" ((rl).c2.c1), \ + "=m" ((rl).c2.c2), \ + "=m" ((rl).c2.c3)); \ +__asm__ __volatile__ ("vextractf128 $0x1, %%ymm6, %0 \n\t" \ + "vextractf128 $0x1, %%ymm7, %2 \n\t" \ + "vextractf128 $0x1, %%ymm8, %4" \ + : \ + "=m" ((rh).c1.c1), \ + "=m" ((rh).c1.c2), \ + "=m" ((rh).c1.c3), \ + "=m" ((rh).c2.c1), \ + "=m" ((rh).c2.c2), \ + "=m" ((rh).c2.c3)) + + +static void mul_umat(su3 *u) +{ + _avx_su3_pair_multiply(u[0],u[1]); +} + + +static void mul_uinv(su3 *u) +{ + _avx_su3_pair_inverse_multiply(u[0],u[1]); +} + + +void Dw_bnd(blk_grid_t grid,int n,int k,int l) +{ + int bc,nb,isw,*ipp; + float moh; + su3 *u; + weyl *w,*wm; + spinor *s,*sl,*sh; + block_t *b; + bndry_t *bb; + + b=blk_list(grid,&nb,&isw); + + if ((n<0)||(n>=nb)) + { + error_loc(1,1,"Dw_bnd [Dw_bnd.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + b+=n; + bb=(*b).bb; + + if ((k<0)||(k>=(*b).ns)||((*b).u==NULL)||(bb==NULL)||(l>=(*bb).nw)) + { + error_loc(1,1,"Dw_bnd [Dw_bnd.c]", + "Attempt to access unallocated memory space"); + return; + } + + bc=bc_type(); + moh=-0.5f; + _load_cst(moh); + s=(*b).s[k]; + +/********************************** face -0 ***********************************/ + + ipp=(*bb).ipp; + w=(*bb).w[l]; + wm=w+(*bb).vol; + + if ((cpr[0]==0)&&((*b).bo[0]==0)&&(bc!=3)) + { + _load_zero(); + + for (;w=nb)) + { + error_loc(1,1,"Dw_bnd [Dw_bnd.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + b+=n; + bb=(*b).bb; + + if ((k<0)||(k>=(*b).ns)||((*b).u==NULL)||(bb==NULL)||(l>=(*bb).nw)) + { + error_loc(1,1,"Dw_bnd [Dw_bnd.c]", + "Attempt to access unallocated memory space"); + return; + } + + bc=bc_type(); + moh=-0.5f; + _load_cst(moh); + s=(*b).s[k]; + +/********************************** face -0 ***********************************/ + + ipp=(*bb).ipp; + w=(*bb).w[l]; + wm=w+(*bb).vol; + + if ((cpr[0]==0)&&((*b).bo[0]==0)&&(bc!=3)) + { + _load_zero(); + + for (;w=nb)) + { + error_loc(1,1,"Dw_bnd [Dw_bnd.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + b+=n; + bb=(*b).bb; + + if ((k<0)||(k>=(*b).ns)||((*b).u==NULL)||(bb==NULL)||(l>=(*bb).nw)) + { + error_loc(1,1,"Dw_bnd [Dw_bnd.c]", + "Attempt to access unallocated memory space"); + return; + } + + bc=bc_type(); + moh=-0.5f; + s=(*b).s[k]; + +/********************************** face -0 ***********************************/ + + ipp=(*bb).ipp; + w=(*bb).w[l]; + wm=w+(*bb).vol; + + if ((cpr[0]==0)&&((*b).bo[0]==0)&&(bc!=3)) + { + for (;w +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "flags.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "sw_term.h" +#include "dirac.h" +#include "global.h" + +#define N0 (NPROC0*L0) + +typedef union +{ + spinor_dble s; + weyl_dble w[2]; +} spin_t; + +static double coe,ceo; +static const spinor_dble sd0={{{0.0}}}; +static spin_t rs ALIGNED32; + +#if (defined AVX) +#include "avx.h" + +#define _load_cst(c) \ +__asm__ __volatile__ ("vbroadcastsd %0, %%ymm15 \n\t" \ + : \ + : \ + "m" (c) \ + : \ + "xmm15") + +#define _mul_cst() \ +__asm__ __volatile__ ("vmulpd %%ymm15, %%ymm0, %%ymm0 \n\t" \ + "vmulpd %%ymm15, %%ymm1, %%ymm1 \n\t" \ + "vmulpd %%ymm15, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +#define _mul_cst_up() \ +__asm__ __volatile__ ("vmulpd %%ymm15, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %%ymm15, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %%ymm15, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + + +static void doe(int *piup,int *pidn,su3_dble *u,spinor_dble *pk) +{ + spinor_dble *sp,*sm; + +/******************************* direction +0 *********************************/ + + sp=pk+(*(piup++)); + + _avx_pair_load_dble((*sp).c1,(*sp).c2); + _avx_pair_load_up_dble((*sp).c3,(*sp).c4); + + sm=pk+(*(pidn++)); + _prefetch_spinor_dble(sm); + + _avx_vector_add_dble(); + sp=pk+(*(piup++)); + _prefetch_spinor_dble(sp); + _avx_su3_multiply_pair_dble(*u); + + _avx_weyl_store_up_dble(rs.w[0]); + _avx_weyl_store_up_dble(rs.w[1]); + +/******************************* direction -0 *********************************/ + + _avx_pair_load_dble((*sm).c1,(*sm).c2); + _avx_pair_load_up_dble((*sm).c3,(*sm).c4); + + _avx_vector_sub_dble(); + sm=pk+(*(pidn++)); + _prefetch_spinor_dble(sm); + u+=1; + _avx_su3_inverse_multiply_pair_dble(*u); + + _avx_weyl_load_dble(rs.w[0]); + _avx_vector_add_dble(); + _avx_weyl_store_dble(rs.w[0]); + + _avx_weyl_load_dble(rs.w[1]); + _avx_vector_sub_dble(); + _avx_weyl_store_dble(rs.w[1]); + +/******************************* direction +1 *********************************/ + + _avx_pair_load_dble((*sp).c1,(*sp).c2); + _avx_pair_load_up_dble((*sp).c4,(*sp).c3); + + _avx_vector_i_add_dble(); + sp=pk+(*(piup++)); + _prefetch_spinor_dble(sp); + u+=1; + _avx_su3_multiply_pair_dble(*u); + + _avx_weyl_load_dble(rs.w[0]); + _avx_vector_add_dble(); + _avx_weyl_store_dble(rs.w[0]); + + _avx_weyl_load_dble(rs.w[1]); + _avx_vector_xch_i_sub_dble(); + _avx_weyl_store_dble(rs.w[1]); + +/******************************* direction -1 *********************************/ + + _avx_pair_load_dble((*sm).c1,(*sm).c2); + _avx_pair_load_up_dble((*sm).c4,(*sm).c3); + + _avx_vector_i_sub_dble(); + sm=pk+(*(pidn++)); + _prefetch_spinor_dble(sm); + u+=1; + _avx_su3_inverse_multiply_pair_dble(*u); + + _avx_weyl_load_dble(rs.w[0]); + _avx_vector_add_dble(); + _avx_weyl_store_dble(rs.w[0]); + + _avx_weyl_load_dble(rs.w[1]); + _avx_vector_xch_i_add_dble(); + _avx_weyl_store_dble(rs.w[1]); + +/******************************* direction +2 *********************************/ + + _avx_pair_load_dble((*sp).c1,(*sp).c2); + _avx_pair_load_up_dble((*sp).c4,(*sp).c3); + + _avx_vector_addsub_dble(); + u+=1; + _avx_su3_multiply_pair_dble(*u); + sp=pk+(*(piup)); + _prefetch_spinor_dble(sp); + _avx_weyl_load_dble(rs.w[0]); + _avx_vector_add_dble(); + _avx_weyl_store_dble(rs.w[0]); + + _avx_weyl_load_dble(rs.w[1]); + _avx_vector_xch_dble(); + _avx_vector_subadd_dble(); + _avx_weyl_store_dble(rs.w[1]); + +/******************************* direction -2 *********************************/ + + _avx_pair_load_dble((*sm).c1,(*sm).c2); + _avx_pair_load_up_dble((*sm).c4,(*sm).c3); + + _avx_vector_subadd_dble(); + sm=pk+(*(pidn)); + _prefetch_spinor_dble(sm); + u+=1; + _avx_su3_inverse_multiply_pair_dble(*u); + + _avx_weyl_load_dble(rs.w[0]); + _avx_vector_add_dble(); + _avx_weyl_store_dble(rs.w[0]); + + _avx_weyl_load_dble(rs.w[1]); + _avx_vector_xch_dble(); + _avx_vector_addsub_dble(); + _avx_weyl_store_dble(rs.w[1]); + +/******************************* direction +3 *********************************/ + + _avx_pair_load_dble((*sp).c1,(*sp).c2); + _avx_pair_load_up_dble((*sp).c3,(*sp).c4); + + _avx_vector_i_addsub_dble(); + u+=1; + _avx_su3_multiply_pair_dble(*u); + + _avx_weyl_load_dble(rs.w[0]); + _avx_vector_add_dble(); + _avx_weyl_store_dble(rs.w[0]); + + _avx_weyl_load_dble(rs.w[1]); + _avx_vector_i_subadd_dble(); + _avx_weyl_store_dble(rs.w[1]); + +/******************************* direction -3 *********************************/ + + _avx_pair_load_dble((*sm).c1,(*sm).c2); + _avx_pair_load_up_dble((*sm).c3,(*sm).c4); + + _avx_vector_i_subadd_dble(); + u+=1; + _avx_su3_inverse_multiply_pair_dble(*u); + + _load_cst(coe); + _avx_weyl_load_dble(rs.w[0]); + _avx_vector_add_dble(); + _mul_cst(); + _avx_pair_store_dble(rs.s.c1,rs.s.c2); + + _avx_weyl_load_dble(rs.w[1]); + _avx_vector_i_addsub_dble(); + _mul_cst(); + _avx_pair_store_dble(rs.s.c3,rs.s.c4); + + _avx_zeroupper(); +} + + +static void deo(int *piup,int *pidn,su3_dble *u,spinor_dble *pl) +{ + spinor_dble *sp,*sm; + +/******************************* direction +0 *********************************/ + + sp=pl+(*(piup++)); + _prefetch_spinor_dble(sp); + + _load_cst(ceo); + _avx_pair_load_dble(rs.s.c1,rs.s.c2); + _avx_pair_load_up_dble(rs.s.c3,rs.s.c4); + _mul_cst(); + _mul_cst_up(); + _avx_weyl_store_dble(rs.w[0]); + _avx_weyl_store_up_dble(rs.w[1]); + + sm=pl+(*(pidn++)); + _prefetch_spinor_dble(sm); + _avx_vector_sub_dble(); + _avx_su3_inverse_multiply_pair_dble(*u); + + _avx_pair_load_dble((*sp).c1,(*sp).c2); + _avx_vector_add_dble(); + _avx_pair_store_dble((*sp).c1,(*sp).c2); + + _avx_pair_load_dble((*sp).c3,(*sp).c4); + _avx_vector_sub_dble(); + _avx_pair_store_dble((*sp).c3,(*sp).c4); + +/******************************* direction -0 *********************************/ + + _avx_weyl_load_dble(rs.w[0]); + _avx_weyl_load_up_dble(rs.w[1]); + + sp=pl+(*(piup++)); + _prefetch_spinor_dble(sp); + _avx_vector_add_dble(); + u+=1; + _avx_su3_multiply_pair_dble(*u); + + _avx_pair_load_dble((*sm).c1,(*sm).c2); + _avx_vector_add_dble(); + _avx_pair_store_dble((*sm).c1,(*sm).c2); + + _avx_pair_load_dble((*sm).c3,(*sm).c4); + _avx_vector_add_dble(); + _avx_pair_store_dble((*sm).c3,(*sm).c4); + +/******************************* direction +1 *********************************/ + + _avx_weyl_load_dble(rs.w[0]); + _avx_weyl_load_up_dble(rs.w[1]); + + sm=pl+(*(pidn++)); + _prefetch_spinor_dble(sm); + _avx_vector_xch_i_sub_dble(); + u+=1; + _avx_su3_inverse_multiply_pair_dble(*u); + + _avx_pair_load_dble((*sp).c1,(*sp).c2); + _avx_vector_add_dble(); + _avx_pair_store_dble((*sp).c1,(*sp).c2); + + _avx_pair_load_dble((*sp).c3,(*sp).c4); + _avx_vector_xch_i_add_dble(); + _avx_pair_store_dble((*sp).c3,(*sp).c4); + +/******************************* direction -1 *********************************/ + + _avx_weyl_load_dble(rs.w[0]); + _avx_weyl_load_up_dble(rs.w[1]); + + sp=pl+(*(piup++)); + _prefetch_spinor_dble(sp); + _avx_vector_xch_i_add_dble(); + u+=1; + _avx_su3_multiply_pair_dble(*u); + + _avx_pair_load_dble((*sm).c1,(*sm).c2); + _avx_vector_add_dble(); + _avx_pair_store_dble((*sm).c1,(*sm).c2); + + _avx_pair_load_dble((*sm).c3,(*sm).c4); + _avx_vector_xch_i_sub_dble(); + _avx_pair_store_dble((*sm).c3,(*sm).c4); + +/******************************* direction +2 *********************************/ + + _avx_weyl_load_dble(rs.w[0]); + _avx_weyl_load_up_dble(rs.w[1]); + + sm=pl+(*(pidn++)); + _prefetch_spinor_dble(sm); + _avx_vector_xch_dble(); + _avx_vector_subadd_dble(); + u+=1; + _avx_su3_inverse_multiply_pair_dble(*u); + + _avx_pair_load_dble((*sp).c1,(*sp).c2); + _avx_vector_add_dble(); + _avx_pair_store_dble((*sp).c1,(*sp).c2); + + _avx_pair_load_dble((*sp).c3,(*sp).c4); + _avx_vector_xch_dble(); + _avx_vector_addsub_dble(); + _avx_pair_store_dble((*sp).c3,(*sp).c4); + +/******************************* direction -2 *********************************/ + + _avx_weyl_load_dble(rs.w[0]); + _avx_weyl_load_up_dble(rs.w[1]); + + sp=pl+(*(piup)); + _prefetch_spinor_dble(sp); + _avx_vector_xch_dble(); + _avx_vector_addsub_dble(); + u+=1; + _avx_su3_multiply_pair_dble(*u); + + _avx_pair_load_dble((*sm).c1,(*sm).c2); + _avx_vector_add_dble(); + _avx_pair_store_dble((*sm).c1,(*sm).c2); + + _avx_pair_load_dble((*sm).c3,(*sm).c4); + _avx_vector_xch_dble(); + _avx_vector_subadd_dble(); + _avx_pair_store_dble((*sm).c3,(*sm).c4); + +/******************************* direction +3 *********************************/ + + _avx_weyl_load_dble(rs.w[0]); + _avx_weyl_load_up_dble(rs.w[1]); + + sm=pl+(*(pidn)); + _prefetch_spinor_dble(sm); + _avx_vector_i_subadd_dble(); + u+=1; + _avx_su3_inverse_multiply_pair_dble(*u); + + _avx_pair_load_dble((*sp).c1,(*sp).c2); + _avx_vector_add_dble(); + _avx_pair_store_dble((*sp).c1,(*sp).c2); + + _avx_pair_load_dble((*sp).c3,(*sp).c4); + _avx_vector_i_addsub_dble(); + _avx_pair_store_dble((*sp).c3,(*sp).c4); + +/******************************* direction -3 *********************************/ + + _avx_weyl_load_dble(rs.w[0]); + _avx_weyl_load_up_dble(rs.w[1]); + + _avx_vector_i_addsub_dble(); + u+=1; + _avx_su3_multiply_pair_dble(*u); + + _avx_pair_load_dble((*sm).c1,(*sm).c2); + _avx_vector_add_dble(); + _avx_pair_store_dble((*sm).c1,(*sm).c2); + + _avx_pair_load_dble((*sm).c3,(*sm).c4); + _avx_vector_i_subadd_dble(); + _avx_pair_store_dble((*sm).c3,(*sm).c4); + + _avx_zeroupper(); +} + +#elif (defined x64) +#include "sse2.h" + +#define _load_cst(c) \ +__asm__ __volatile__ ("movddup %0, %%xmm15" \ + : \ + : \ + "m" (c) \ + : \ + "xmm15") + +#define _mul_cst() \ +__asm__ __volatile__ ("mulpd %%xmm15, %%xmm0 \n\t" \ + "mulpd %%xmm15, %%xmm1 \n\t" \ + "mulpd %%xmm15, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +#define _mul_cst_up() \ +__asm__ __volatile__ ("mulpd %%xmm15, %%xmm3 \n\t" \ + "mulpd %%xmm15, %%xmm4 \n\t" \ + "mulpd %%xmm15, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + + +static void doe(int *piup,int *pidn,su3_dble *u,spinor_dble *pk) +{ + spinor_dble *sp,*sm; + +/******************************* direction +0 *********************************/ + + sp=pk+(*(piup++)); + + _sse_load_dble((*sp).c1); + _sse_load_up_dble((*sp).c3); + + sm=pk+(*(pidn++)); + _prefetch_spinor_dble(sm); + _sse_vector_add_dble(); + _sse_su3_multiply_dble(*u); + _sse_store_up_dble(rs.s.c1); + _sse_store_up_dble(rs.s.c3); + + _sse_load_dble((*sp).c2); + _sse_load_up_dble((*sp).c4); + + u+=1; + _prefetch_su3_dble(u); + u-=1; + + _sse_vector_add_dble(); + _sse_su3_multiply_dble(*u); + + _sse_store_up_dble(rs.s.c2); + _sse_store_up_dble(rs.s.c4); + +/******************************* direction -0 *********************************/ + + _sse_load_dble((*sm).c1); + _sse_load_up_dble((*sm).c3); + + sp=pk+(*(piup++)); + _prefetch_spinor_dble(sp); + _sse_vector_sub_dble(); + u+=1; + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble(rs.s.c1); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c1); + + _sse_load_dble(rs.s.c3); + _sse_vector_sub_dble(); + _sse_store_dble(rs.s.c3); + + _sse_load_dble((*sm).c2); + _sse_load_up_dble((*sm).c4); + + u+=1; + _prefetch_su3_dble(u); + u-=1; + + _sse_vector_sub_dble(); + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble(rs.s.c2); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c2); + + _sse_load_dble(rs.s.c4); + _sse_vector_sub_dble(); + _sse_store_dble(rs.s.c4); + +/******************************* direction +1 *********************************/ + + _sse_load_dble((*sp).c1); + _sse_load_up_dble((*sp).c4); + + sm=pk+(*(pidn++)); + _prefetch_spinor_dble(sm); + _sse_vector_i_add_dble(); + u+=1; + _sse_su3_multiply_dble(*u); + + _sse_load_dble(rs.s.c1); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c1); + + _sse_load_dble(rs.s.c4); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _sse_store_dble(rs.s.c4); + + _sse_load_dble((*sp).c2); + _sse_load_up_dble((*sp).c3); + + u+=1; + _prefetch_su3_dble(u); + u-=1; + + _sse_vector_i_add_dble(); + _sse_su3_multiply_dble(*u); + + _sse_load_dble(rs.s.c2); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c2); + + _sse_load_dble(rs.s.c3); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _sse_store_dble(rs.s.c3); + +/******************************* direction -1 *********************************/ + + _sse_load_dble((*sm).c1); + _sse_load_up_dble((*sm).c4); + + sp=pk+(*(piup++)); + _prefetch_spinor_dble(sp); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + u+=1; + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble(rs.s.c1); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c1); + + _sse_load_dble(rs.s.c4); + _sse_vector_i_add_dble(); + _sse_store_dble(rs.s.c4); + + _sse_load_dble((*sm).c2); + _sse_load_up_dble((*sm).c3); + + u+=1; + _prefetch_su3_dble(u); + u-=1; + + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble(rs.s.c2); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c2); + + _sse_load_dble(rs.s.c3); + _sse_vector_i_add_dble(); + _sse_store_dble(rs.s.c3); + +/******************************* direction +2 *********************************/ + + _sse_load_dble((*sp).c1); + _sse_load_up_dble((*sp).c4); + + sm=pk+(*(pidn++)); + _prefetch_spinor_dble(sm); + _sse_vector_add_dble(); + u+=1; + _sse_su3_multiply_dble(*u); + + _sse_load_dble(rs.s.c1); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c1); + + _sse_load_dble(rs.s.c4); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c4); + + _sse_load_dble((*sp).c2); + _sse_load_up_dble((*sp).c3); + + u+=1; + _prefetch_su3_dble(u); + u-=1; + + _sse_vector_sub_dble(); + _sse_su3_multiply_dble(*u); + + _sse_load_dble(rs.s.c2); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c2); + + _sse_load_dble(rs.s.c3); + _sse_vector_sub_dble(); + _sse_store_dble(rs.s.c3); + +/******************************* direction -2 *********************************/ + + _sse_load_dble((*sm).c1); + _sse_load_up_dble((*sm).c4); + + sp=pk+(*(piup)); + _prefetch_spinor_dble(sp); + _sse_vector_sub_dble(); + u+=1; + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble(rs.s.c1); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c1); + + _sse_load_dble(rs.s.c4); + _sse_vector_sub_dble(); + _sse_store_dble(rs.s.c4); + + _sse_load_dble((*sm).c2); + _sse_load_up_dble((*sm).c3); + + u+=1; + _prefetch_su3_dble(u); + u-=1; + + _sse_vector_add_dble(); + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble(rs.s.c2); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c2); + + _sse_load_dble(rs.s.c3); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c3); + +/******************************* direction +3 *********************************/ + + _sse_load_dble((*sp).c1); + _sse_load_up_dble((*sp).c3); + + sm=pk+(*(pidn)); + _prefetch_spinor_dble(sm); + _sse_vector_i_add_dble(); + u+=1; + _sse_su3_multiply_dble(*u); + + _sse_load_dble(rs.s.c1); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c1); + + _sse_load_dble(rs.s.c3); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _sse_store_dble(rs.s.c3); + + _sse_load_dble((*sp).c2); + _sse_load_up_dble((*sp).c4); + + u+=1; + _prefetch_su3_dble(u); + u-=1; + + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _sse_su3_multiply_dble(*u); + + _sse_load_dble(rs.s.c2); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c2); + + _sse_load_dble(rs.s.c4); + _sse_vector_i_add_dble(); + _sse_store_dble(rs.s.c4); + +/******************************* direction -3 *********************************/ + + _sse_load_dble((*sm).c1); + _sse_load_up_dble((*sm).c3); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + u+=1; + _sse_su3_inverse_multiply_dble(*u); + + _load_cst(coe); + _sse_load_dble(rs.s.c1); + _sse_vector_add_dble(); + _mul_cst(); + _sse_store_dble(rs.s.c1); + + _sse_load_dble(rs.s.c3); + _sse_vector_i_add_dble(); + _mul_cst(); + _sse_store_dble(rs.s.c3); + + _sse_load_dble((*sm).c2); + _sse_load_up_dble((*sm).c4); + + u+=1; + _prefetch_su3_dble(u); + u-=1; + + _sse_vector_i_add_dble(); + _sse_su3_inverse_multiply_dble(*u); + + _load_cst(coe); + _sse_load_dble(rs.s.c2); + _sse_vector_add_dble(); + _mul_cst(); + _sse_store_dble(rs.s.c2); + + _sse_load_dble(rs.s.c4); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _mul_cst(); + _sse_store_dble(rs.s.c4); +} + + +static void deo(int *piup,int *pidn,su3_dble *u,spinor_dble *pl) +{ + spinor_dble *sp,*sm; + +/******************************* direction +0 *********************************/ + + sp=pl+(*(piup++)); + _prefetch_spinor_dble(sp); + + _load_cst(ceo); + _sse_load_dble(rs.s.c1); + _sse_load_up_dble(rs.s.c3); + _mul_cst(); + _mul_cst_up(); + _sse_store_dble(rs.s.c1); + _sse_store_up_dble(rs.s.c3); + + sm=pl+(*(pidn++)); + _prefetch_spinor_dble(sm); + _sse_vector_sub_dble(); + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble((*sp).c1); + _sse_vector_add_dble(); + _sse_store_dble((*sp).c1); + + _sse_load_dble((*sp).c3); + _sse_vector_sub_dble(); + _sse_store_dble((*sp).c3); + + _load_cst(ceo); + _sse_load_dble(rs.s.c2); + _sse_load_up_dble(rs.s.c4); + _mul_cst(); + _mul_cst_up(); + _sse_store_dble(rs.s.c2); + _sse_store_up_dble(rs.s.c4); + + _sse_vector_sub_dble(); + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble((*sp).c2); + _sse_vector_add_dble(); + _sse_store_dble((*sp).c2); + + _sse_load_dble((*sp).c4); + _sse_vector_sub_dble(); + _sse_store_dble((*sp).c4); + +/******************************* direction -0 *********************************/ + + _sse_load_dble(rs.s.c1); + _sse_load_up_dble(rs.s.c3); + + sp=pl+(*(piup++)); + _prefetch_spinor_dble(sp); + _sse_vector_add_dble(); + u+=1; + _sse_su3_multiply_dble(*u); + + _sse_load_dble((*sm).c1); + _sse_vector_add_dble(); + _sse_store_dble((*sm).c1); + + _sse_load_dble((*sm).c3); + _sse_vector_add_dble(); + _sse_store_dble((*sm).c3); + + _sse_load_dble(rs.s.c2); + _sse_load_up_dble(rs.s.c4); + + _sse_vector_add_dble(); + _sse_su3_multiply_dble(*u); + + _sse_load_dble((*sm).c2); + _sse_vector_add_dble(); + _sse_store_dble((*sm).c2); + + _sse_load_dble((*sm).c4); + _sse_vector_add_dble(); + _sse_store_dble((*sm).c4); + +/******************************* direction +1 *********************************/ + + _sse_load_dble(rs.s.c1); + _sse_load_up_dble(rs.s.c4); + + sm=pl+(*(pidn++)); + _prefetch_spinor_dble(sm); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + u+=1; + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble((*sp).c1); + _sse_vector_add_dble(); + _sse_store_dble((*sp).c1); + + _sse_load_dble((*sp).c4); + _sse_vector_i_add_dble(); + _sse_store_dble((*sp).c4); + + _sse_load_dble(rs.s.c2); + _sse_load_up_dble(rs.s.c3); + + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble((*sp).c2); + _sse_vector_add_dble(); + _sse_store_dble((*sp).c2); + + _sse_load_dble((*sp).c3); + _sse_vector_i_add_dble(); + _sse_store_dble((*sp).c3); + +/******************************* direction -1 *********************************/ + + _sse_load_dble(rs.s.c1); + _sse_load_up_dble(rs.s.c4); + + sp=pl+(*(piup++)); + _prefetch_spinor_dble(sp); + _sse_vector_i_add_dble(); + u+=1; + _sse_su3_multiply_dble(*u); + + _sse_load_dble((*sm).c1); + _sse_vector_add_dble(); + _sse_store_dble((*sm).c1); + + _sse_load_dble((*sm).c4); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _sse_store_dble((*sm).c4); + + _sse_load_dble(rs.s.c2); + _sse_load_up_dble(rs.s.c3); + + _sse_vector_i_add_dble(); + _sse_su3_multiply_dble(*u); + + _sse_load_dble((*sm).c2); + _sse_vector_add_dble(); + _sse_store_dble((*sm).c2); + + _sse_load_dble((*sm).c3); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _sse_store_dble((*sm).c3); + +/******************************* direction +2 *********************************/ + + _sse_load_dble(rs.s.c1); + _sse_load_up_dble(rs.s.c4); + + sm=pl+(*(pidn++)); + _prefetch_spinor_dble(sm); + _sse_vector_sub_dble(); + u+=1; + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble((*sp).c1); + _sse_vector_add_dble(); + _sse_store_dble((*sp).c1); + + _sse_load_dble((*sp).c4); + _sse_vector_sub_dble(); + _sse_store_dble((*sp).c4); + + _sse_load_dble(rs.s.c2); + _sse_load_up_dble(rs.s.c3); + + _sse_vector_add_dble(); + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble((*sp).c2); + _sse_vector_add_dble(); + _sse_store_dble((*sp).c2); + + _sse_load_dble((*sp).c3); + _sse_vector_add_dble(); + _sse_store_dble((*sp).c3); + +/******************************* direction -2 *********************************/ + + _sse_load_dble(rs.s.c1); + _sse_load_up_dble(rs.s.c4); + + sp=pl+(*(piup)); + _prefetch_spinor_dble(sp); + _sse_vector_add_dble(); + u+=1; + _sse_su3_multiply_dble(*u); + + _sse_load_dble((*sm).c1); + _sse_vector_add_dble(); + _sse_store_dble((*sm).c1); + + _sse_load_dble((*sm).c4); + _sse_vector_add_dble(); + _sse_store_dble((*sm).c4); + + _sse_load_dble(rs.s.c2); + _sse_load_up_dble(rs.s.c3); + + _sse_vector_sub_dble(); + _sse_su3_multiply_dble(*u); + + _sse_load_dble((*sm).c2); + _sse_vector_add_dble(); + _sse_store_dble((*sm).c2); + + _sse_load_dble((*sm).c3); + _sse_vector_sub_dble(); + _sse_store_dble((*sm).c3); + +/******************************* direction +3 *********************************/ + + _sse_load_dble(rs.s.c1); + _sse_load_up_dble(rs.s.c3); + + sm=pl+(*(pidn)); + _prefetch_spinor_dble(sm); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + u+=1; + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble((*sp).c1); + _sse_vector_add_dble(); + _sse_store_dble((*sp).c1); + + _sse_load_dble((*sp).c3); + _sse_vector_i_add_dble(); + _sse_store_dble((*sp).c3); + + _sse_load_dble(rs.s.c2); + _sse_load_up_dble(rs.s.c4); + + _sse_vector_i_add_dble(); + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble((*sp).c2); + _sse_vector_add_dble(); + _sse_store_dble((*sp).c2); + + _sse_load_dble((*sp).c4); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _sse_store_dble((*sp).c4); + +/******************************* direction -3 *********************************/ + + _sse_load_dble(rs.s.c1); + _sse_load_up_dble(rs.s.c3); + + _sse_vector_i_add_dble(); + u+=1; + _sse_su3_multiply_dble(*u); + + _sse_load_dble((*sm).c1); + _sse_vector_add_dble(); + _sse_store_dble((*sm).c1); + + _sse_load_dble((*sm).c3); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _sse_store_dble((*sm).c3); + + _sse_load_dble(rs.s.c2); + _sse_load_up_dble(rs.s.c4); + + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _sse_su3_multiply_dble(*u); + + _sse_load_dble((*sm).c2); + _sse_vector_add_dble(); + _sse_store_dble((*sm).c2); + + _sse_load_dble((*sm).c4); + _sse_vector_i_add_dble(); + _sse_store_dble((*sm).c4); +} + +#else + +#define _vector_mul_assign(r,c) \ + (r).c1.re*=(c); \ + (r).c1.im*=(c); \ + (r).c2.re*=(c); \ + (r).c2.im*=(c); \ + (r).c3.re*=(c); \ + (r).c3.im*=(c) + + +static void doe(int *piup,int *pidn,su3_dble *u,spinor_dble *pk) +{ + spinor_dble *sp,*sm; + su3_vector_dble psi,chi; + +/******************************* direction +0 *********************************/ + + sp=pk+(*(piup++)); + + _vector_add(psi,(*sp).c1,(*sp).c3); + _su3_multiply(rs.s.c1,*u,psi); + rs.s.c3=rs.s.c1; + + _vector_add(psi,(*sp).c2,(*sp).c4); + _su3_multiply(rs.s.c2,*u,psi); + rs.s.c4=rs.s.c2; + +/******************************* direction -0 *********************************/ + + sm=pk+(*(pidn++)); + u+=1; + + _vector_sub(psi,(*sm).c1,(*sm).c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_sub_assign(rs.s.c3,chi); + + _vector_sub(psi,(*sm).c2,(*sm).c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_sub_assign(rs.s.c4,chi); + +/******************************* direction +1 *********************************/ + + sp=pk+(*(piup++)); + u+=1; + + _vector_i_add(psi,(*sp).c1,(*sp).c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_i_sub_assign(rs.s.c4,chi); + + _vector_i_add(psi,(*sp).c2,(*sp).c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_i_sub_assign(rs.s.c3,chi); + +/******************************* direction -1 *********************************/ + + sm=pk+(*(pidn++)); + u+=1; + + _vector_i_sub(psi,(*sm).c1,(*sm).c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_i_add_assign(rs.s.c4,chi); + + _vector_i_sub(psi,(*sm).c2,(*sm).c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_i_add_assign(rs.s.c3,chi); + +/******************************* direction +2 *********************************/ + + sp=pk+(*(piup++)); + u+=1; + + _vector_add(psi,(*sp).c1,(*sp).c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_add_assign(rs.s.c4,chi); + + _vector_sub(psi,(*sp).c2,(*sp).c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_sub_assign(rs.s.c3,chi); + +/******************************* direction -2 *********************************/ + + sm=pk+(*(pidn++)); + u+=1; + + _vector_sub(psi,(*sm).c1,(*sm).c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_sub_assign(rs.s.c4,chi); + + _vector_add(psi,(*sm).c2,(*sm).c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_add_assign(rs.s.c3,chi); + +/******************************* direction +3 *********************************/ + + sp=pk+(*(piup)); + u+=1; + + _vector_i_add(psi,(*sp).c1,(*sp).c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_i_sub_assign(rs.s.c3,chi); + + _vector_i_sub(psi,(*sp).c2,(*sp).c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_i_add_assign(rs.s.c4,chi); + +/******************************* direction -3 *********************************/ + + sm=pk+(*(pidn)); + u+=1; + + _vector_i_sub(psi,(*sm).c1,(*sm).c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_i_add_assign(rs.s.c3,chi); + + _vector_i_add(psi,(*sm).c2,(*sm).c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_i_sub_assign(rs.s.c4,chi); + + _vector_mul_assign(rs.s.c1,coe); + _vector_mul_assign(rs.s.c2,coe); + _vector_mul_assign(rs.s.c3,coe); + _vector_mul_assign(rs.s.c4,coe); +} + + +static void deo(int *piup,int *pidn,su3_dble *u,spinor_dble *pl) +{ + spinor_dble *sp,*sm; + su3_vector_dble psi,chi; + + _vector_mul_assign(rs.s.c1,ceo); + _vector_mul_assign(rs.s.c2,ceo); + _vector_mul_assign(rs.s.c3,ceo); + _vector_mul_assign(rs.s.c4,ceo); + +/******************************* direction +0 *********************************/ + + sp=pl+(*(piup++)); + + _vector_sub(psi,rs.s.c1,rs.s.c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c1,chi); + _vector_sub_assign((*sp).c3,chi); + + _vector_sub(psi,rs.s.c2,rs.s.c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c2,chi); + _vector_sub_assign((*sp).c4,chi); + +/******************************* direction -0 *********************************/ + + sm=pl+(*(pidn++)); + u+=1; + + _vector_add(psi,rs.s.c1,rs.s.c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c1,chi); + _vector_add_assign((*sm).c3,chi); + + _vector_add(psi,rs.s.c2,rs.s.c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c2,chi); + _vector_add_assign((*sm).c4,chi); + +/******************************* direction +1 *********************************/ + + sp=pl+(*(piup++)); + u+=1; + + _vector_i_sub(psi,rs.s.c1,rs.s.c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c1,chi); + _vector_i_add_assign((*sp).c4,chi); + + _vector_i_sub(psi,rs.s.c2,rs.s.c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c2,chi); + _vector_i_add_assign((*sp).c3,chi); + +/******************************* direction -1 *********************************/ + + sm=pl+(*(pidn++)); + u+=1; + + _vector_i_add(psi,rs.s.c1,rs.s.c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c1,chi); + _vector_i_sub_assign((*sm).c4,chi); + + _vector_i_add(psi,rs.s.c2,rs.s.c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c2,chi); + _vector_i_sub_assign((*sm).c3,chi); + +/******************************* direction +2 *********************************/ + + sp=pl+(*(piup++)); + u+=1; + + _vector_sub(psi,rs.s.c1,rs.s.c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c1,chi); + _vector_sub_assign((*sp).c4,chi); + + _vector_add(psi,rs.s.c2,rs.s.c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c2,chi); + _vector_add_assign((*sp).c3,chi); + +/******************************* direction -2 *********************************/ + + sm=pl+(*(pidn++)); + u+=1; + + _vector_add(psi,rs.s.c1,rs.s.c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c1,chi); + _vector_add_assign((*sm).c4,chi); + + _vector_sub(psi,rs.s.c2,rs.s.c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c2,chi); + _vector_sub_assign((*sm).c3,chi); + +/******************************* direction +3 *********************************/ + + sp=pl+(*(piup)); + u+=1; + + _vector_i_sub(psi,rs.s.c1,rs.s.c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c1,chi); + _vector_i_add_assign((*sp).c3,chi); + + _vector_i_add(psi,rs.s.c2,rs.s.c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c2,chi); + _vector_i_sub_assign((*sp).c4,chi); + +/******************************* direction -3 *********************************/ + + sm=pl+(*(pidn)); + u+=1; + + _vector_i_add(psi,rs.s.c1,rs.s.c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c1,chi); + _vector_i_sub_assign((*sm).c3,chi); + + _vector_i_sub(psi,rs.s.c2,rs.s.c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c2,chi); + _vector_i_add_assign((*sm).c4,chi); +} + +#endif + +void Dw_dble(double mu,spinor_dble *s,spinor_dble *r) +{ + int bc,ix,t; + int *piup,*pidn; + su3_dble *u,*um; + pauli_dble *m; + spin_t *so,*ro; + tm_parms_t tm; + + cpsd_int_bnd(0x1,s); + m=swdfld(); + apply_sw_dble(VOLUME/2,mu,m,s,r); + set_sd2zero(BNDRY/2,r+VOLUME); + tm=tm_parms(); + if (tm.eoflg==1) + mu=0.0; + + coe=-0.5; + ceo=-0.5; + bc=bc_type(); + piup=iup[VOLUME/2]; + pidn=idn[VOLUME/2]; + + so=(spin_t*)(s+(VOLUME/2)); + ro=(spin_t*)(r+(VOLUME/2)); + m+=VOLUME; + u=udfld(); + um=u+4*VOLUME; + + if (((cpr[0]==0)&&(bc!=3))||((cpr[0]==(NPROC0-1))&&(bc==0))) + { + ix=VOLUME/2; + + for (;u0)&&((t<(N0-1))||(bc!=0))) + { + doe(piup,pidn,u,s); + + mul_pauli_dble(mu,m,(*so).w,(*ro).w); + mul_pauli_dble(-mu,m+1,(*so).w+1,(*ro).w+1); + + _vector_add_assign((*ro).s.c1,rs.s.c1); + _vector_add_assign((*ro).s.c2,rs.s.c2); + _vector_add_assign((*ro).s.c3,rs.s.c3); + _vector_add_assign((*ro).s.c4,rs.s.c4); + rs=(*so); + + deo(piup,pidn,u,r); + } + else + { + (*so).s=sd0; + (*ro).s=sd0; + } + + piup+=4; + pidn+=4; + so+=1; + ro+=1; + m+=2; + } + } + else + { + for (;u0)&&((t<(N0-1))||(bc!=0))) + { + mul_pauli_dble(mu,m,(*se).w,(*re).w); + mul_pauli_dble(-mu,m+1,(*se).w+1,(*re).w+1); + } + else + { + (*se).s=sd0; + (*re).s=sd0; + } + + se+=1; + re+=1; + } + } + else + { + for (;m0)&&((t<(N0-1))||(bc!=0))) + { + mul_pauli_dble(mu,m,(*so).w,(*ro).w); + mul_pauli_dble(-mu,m+1,(*so).w+1,(*ro).w+1); + } + else + { + (*so).s=sd0; + (*ro).s=sd0; + } + + so+=1; + ro+=1; + } + } + else + { + for (;m0)&&((t<(N0-1))||(bc!=0))) + { + doe(piup,pidn,u,s); + (*ro)=rs; + } + else + (*ro).s=sd0; + + piup+=4; + pidn+=4; + ro+=1; + } + } + else + { + for (;u0)&&((t<(N0-1))||(bc!=0))) + { + rs=(*so); + deo(piup,pidn,u,r); + } + else + (*so).s=sd0; + + piup+=4; + pidn+=4; + so+=1; + } + } + else + { + for (;u0)&&((t<(N0-1))||(bc!=0))) + { + doe(piup,pidn,u,s); + + mul_pauli_dble(0.0,m,rs.w,rs.w); + mul_pauli_dble(0.0,m+1,rs.w+1,rs.w+1); + + deo(piup,pidn,u,r); + } + + piup+=4; + pidn+=4; + m+=2; + } + } + else + { + for (;u=nb)) + { + error_loc(1,1,"Dw_blk_dble [Dw_dble.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k==l)||(k>=(*b).nsd)||(l>=(*b).nsd)||((*b).ud==NULL)) + { + error_loc(1,1,"Dw_blk_dbl [Dw_dble.c]", + "Attempt to access unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + volh=vol/2; + s=(*b).sd[k]; + r=(*b).sd[l]; + so=(spin_t*)(s+volh); + ro=(spin_t*)(r+volh); + + s[vol]=sd0; + r[vol]=sd0; + m=(*b).swd; + apply_sw_dble(volh,mu,m,s,r); + tm=tm_parms(); + if (tm.eoflg==1) + mu=0.0; + + coe=-0.5; + ceo=-0.5; + piup=(*b).iup[volh]; + pidn=(*b).idn[volh]; + m+=vol; + u=(*b).ud; + um=u+4*vol; + + if ((*b).nbp) + { + ibp=(*b).ibp; + ibm=ibp+(*b).nbp/2; + + for (;ibp=nb)) + { + error_loc(1,1,"Dwee_blk_dble [Dw_dble.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k>=(*b).nsd)||(l>=(*b).nsd)||((*b).ud==NULL)) + { + error_loc(1,1,"Dwee_blk_dbl [Dw_dble.c]", + "Attempt to access unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + se=(spin_t*)((*b).sd[k]); + re=(spin_t*)((*b).sd[l]); + m=(*b).swd; + mm=m+vol; + + if ((*b).nbp) + { + piup=(*b).iup[0]; + pidn=(*b).idn[0]; + + ibu=((cpr[0]==(NPROC0-1))&&(((*b).bo[0]+(*b).bs[0])==L0)&&(bc_type()==0)); + ibd=((cpr[0]==0)&&((*b).bo[0]==0)); + + for (;m=nb)) + { + error_loc(1,1,"Dwoo_blk_dble [Dw_dble.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k>=(*b).nsd)||(l>=(*b).nsd)||((*b).ud==NULL)) + { + error_loc(1,1,"Dwoo_blk_dbl [Dw_dble.c]", + "Attempt to access unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + volh=vol/2; + so=(spin_t*)((*b).sd[k]+volh); + ro=(spin_t*)((*b).sd[l]+volh); + tm=tm_parms(); + if (tm.eoflg==1) + mu=0.0; + + m=(*b).swd+vol; + mm=m+vol; + + if ((*b).nbp) + { + piup=(*b).iup[volh]; + pidn=(*b).idn[volh]; + + ibu=((cpr[0]==(NPROC0-1))&&(((*b).bo[0]+(*b).bs[0])==L0)&&(bc_type()==0)); + ibd=((cpr[0]==0)&&((*b).bo[0]==0)); + + for (;m=nb)) + { + error_loc(1,1,"Dwoe_blk_dble [Dw_dble.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k>=(*b).nsd)||(l>=(*b).nsd)||((*b).ud==NULL)) + { + error_loc(1,1,"Dwoe_blk_dbl [Dw_dble.c]", + "Attempt to access unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + volh=vol/2; + s=(*b).sd[k]; + ro=(spin_t*)((*b).sd[l]+volh); + s[vol]=sd0; + + coe=-0.5; + piup=(*b).iup[volh]; + pidn=(*b).idn[volh]; + u=(*b).ud; + um=u+4*vol; + + if ((*b).nbp) + { + ibp=(*b).ibp; + ibm=ibp+(*b).nbp/2; + + for (;ibp=nb)) + { + error_loc(1,1,"Dweo_blk_dble [Dw_dble.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k>=(*b).nsd)||(l>=(*b).nsd)||((*b).ud==NULL)) + { + error_loc(1,1,"Dweo_blk_dbl [Dw_dble.c]", + "Attempt to access unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + volh=vol/2; + so=(spin_t*)((*b).sd[k]+volh); + r=(*b).sd[l]; + r[vol]=sd0; + + ceo=0.5; + piup=(*b).iup[volh]; + pidn=(*b).idn[volh]; + u=(*b).ud; + um=u+4*vol; + + if ((*b).nbp) + { + ibu=((cpr[0]==(NPROC0-1))&&(((*b).bo[0]+(*b).bs[0])==L0)&&(bc_type()==0)); + ibd=((cpr[0]==0)&&((*b).bo[0]==0)); + + for (;u=nb)) + { + error_loc(1,1,"Dwhat_blk_dble [Dw_dble.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k==l)||(k>=(*b).nsd)||(l>=(*b).nsd)||((*b).ud==NULL)) + { + error_loc(1,1,"Dwhat_blk_dbl [Dw_dble.c]", + "Attempt to access unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + volh=vol/2; + s=(*b).sd[k]; + r=(*b).sd[l]; + + s[vol]=sd0; + r[vol]=sd0; + m=(*b).swd; + apply_sw_dble(volh,mu,m,s,r); + + coe=-0.5; + ceo=0.5; + piup=(*b).iup[volh]; + pidn=(*b).idn[volh]; + m+=vol; + u=(*b).ud; + um=u+4*vol; + + if ((*b).nbp) + { + ibp=(*b).ibp; + ibm=ibp+(*b).nbp/2; + + for (;ibp]" (after any number of blanks), where is + the integer value passed by the argument. An error occurs if no such + line or more than one is found. The lines + + action + ipf + im0 + irat + imu [] + isp [] + + are then read using read_line() [utils/mutils.c]. Depending on the + value of "action", some lines are not read and can be omitted in + the input file. The number of integer items on the lines with tag + "imu" and "isp" depends on the action too. The data are then added + to the data base by calling set_action_parms(iact,...). + +void print_action_parms(void) + Prints the parameters of the defined actions to stdout on MPI + process 0. + +void write_action_parms(FILE *fdat) + Writes the parameters of the defined actions to the file fdat on + MPI process 0. + +void check_action_parms(FILE *fdat) + Compares the parameters of the defined actions with those stored + on the file fdat on MPI process 0, assuming the latter were written + to the file by the program write_action_parms(). + +dfl_parms_t set_dfl_parms(int *bs,int Ns) + Sets the parameters of the deflation subspace. The parameters are + + bs[4] Sizes of the blocks in DFL_BLOCKS block grid. + + Ns Number of deflation modes per block (must be + even and non-zero). + + The return value is a structure that contains the above parameters. + Note that these parameters can only be set once. + +dfl_parms_t dfl_parms(void) + Returns the parameters currently set for the deflation subspace. + +dfl_pro_parms_t set_dfl_pro_parms(int nkv,int nmx,double res) + Sets the parameters used when applying the deflation projection in the + deflated solver program dfl_sap_gcr(). The parameters are + + nkv Maximal number of Krylov vectors to be used by the + solver for the little Dirac equation before a restart. + + nmx Maximal total number of Krylov vectors generated by + the solver for the little Dirac equation. + + res Required relative residue when solving the little + Dirac equation. + + The return value is a structure that contains the above parameters. + +dfl_pro_parms_t dfl_pro_parms(void) + Returns the parameters currently set for the deflation projectors in + the deflated solver program dfl_sap_gcr(). + +dfl_gen_parms_t set_dfl_gen_parms(double kappa,double mu, + int ninv,int nmr,int ncy, + int nkv,int nmx,double res) + Sets the parameters of the inverse iteration procedure that generates + the deflation subspace. The parameters are + + kappa Hopping parameter of the Dirac operator. + + mu Twisted mass parameter. + + ninv Total number of inverse iteration steps (ninv>=4). + + nmr Number of block minimal residual iterations to be + used when the SAP smoother is applied. + + ncy Number of SAP cycles per inverse iteration. + + The return value is a structure that contains the above parameters and + the bare mass m0 that corresponds to the hopping parameter kappa. + +dfl_gen_parms_t dfl_gen_parms(void) + Returns the parameters currently set for the generation of the deflation + subspace plus the corresponding bare mass m0. + +dfl_upd_parms_t set_dfl_upd_parms(double dtau,int nsm) + Sets the parameters of the deflation subspace update scheme. The + parameters are + + dtau Molecular-dynamics time separation between + updates of the deflation subspace. + + nsm Number of deflated smoothing interations to be + applied when the subspace is updated. + + The return value is a structure that contains the above parameters. + +dfl_upd_parms_t dfl_upd_parms(void) + Returns the parameters currently set for the deflation subspace + update scheme. + +void print_dfl_parms(int ipr) + Prints the parameters of the deflation subspace, the projectors, the + subspace generation algorithm and the update scheme to stdout on MPI + process 0. The update scheme is omitted if ipr=0. + +void write_dfl_parms(FILE *fdat) + Writes the parameters of the deflation subspace, the projectors, the + subspace generation algorithm and the update scheme to the file fdat + on MPI process 0. + +void check_dfl_parms(FILE *fdat) + Compares the parameters of the deflation subspace, the projectors the + subspace generation algorithm and the update scheme with the values + stored on the file fdat on MPI process 0, assuming the latter were + written to the file by the program write_dfl_parms() (mismatches of + maximal solver iteration numbers are not considered to be an error). + +void set_flags(event_t event) + Reports an event to the data base, which changed the global gauge + gauge or SW fields. + +void set_grid_flags(blk_grid_t grid,event_t event) + Reports an event to the data base, which changed the gauge or SW + fields on the specified block grid. + +int query_flags(query_t query) + Queries the data base on the status of the global gauge or SW + fields. The program returns 1 or 0 depending on whether the answer + to the specified query is "yes" or "no". If the query is unknown to + the data base, the program returns -1. + +int query_grid_flags(blk_grid_t grid,query_t query) + Queries the data base on the status of the gauge or SW fields on + the specified block grid. The program returns 1 or 0 depending on + whether the answer to the specified query is "yes" or "no". If the + query is unknown to the data base, the program returns -1. + +void print_flags(void) + Prints the current values of all flags related to the global gauge + and SW fields to stdout from process 0. + +void print_grid_flags(blk_grid_t grid) + Prints the current values of all flags related to the gauge and SW + fields on the specified block grid to stdout from process 0. + +force_parms_t set_force_parms(int ifr,force_t force,int ipf,int im0, + int *irat,int *imu,int *isp,int *ncr) + Sets the parameters in the force parameter set number ifr and returns + a structure containing them (see the notes). + +force_parms_t force_parms(int ifr) + Returns a structure containing the force parameter set number ifr + (see the notes). + +void read_force_parms(int ifr) + On process 0, this program scans stdin for a line starting with the + string "[Force ]" (after any number of blanks), where is + the integer value passed by the argument. An error occurs if no such + line or more than one is found. The lines + + force + ipf + im0 + irat + imu [] + isp [] + ncr [] + + are then read using read_line() [utils/mutils.c]. Depending on the + value of "force", some lines are not read and can be omitted in the + input file. The number of integer items on the lines with tag "imu" + and "isp" and "ncr" depends on the force too. The data are then added + to the data base by calling set_force_parms(ifr,...). + +void read_force_parms2(int ifr) + Same as read_force_parms() except that only the lines + + force + isp [] + ncr [] + + are read from stdin. All other force parameters are inferred from + the parameters of the action no ifr so that the force is the one + deriving from that action. An error occurs if the parameters of the + action no ifr have not previously been added to the data base or + if the force and action types do not match. + +void print_force_parms(void) + Prints the parameters of the defined forces to stdout on MPI + process 0. + +void print_force_parms2(void) + Prints the parameters of the defined forces to stdout on MPI + process 0 in a short format corresponding to read_force_parms2(). + +void write_force_parms(FILE *fdat) + Writes the parameters of the defined forces to the file fdat on + MPI process 0. + +void check_force_parms(FILE *fdat) + Compares the parameters of the defined forces with those stored + on the file fdat on MPI process 0, assuming the latter were written + to the file by the program write_force_parms(). + +hmc_parms_t set_hmc_parms(int nact,int *iact,int npf,int nmu, + double *mu,int nlv,double tau) + Sets some basic parameters of the HMC algorithm. The parameters are + + nact Number of terms in the total action + + iact Indices iact[i] of the action terms (i=0,..,nact-1) + + npf Number of pseudo-fermion fields on which the action + depends + + nmu Number of twisted mass parameters on which the + pseudo-fermion actions and forces depend + + mu Twisted masses mu[i] (i=0,..,nmu-1) + + nlv Number of levels of the molecular-dynamics integrator + + tau Molecular-dynamics trajectory length + + The total action must include the gauge action, but pseudo-fermion + actions are optional and the momentum action is treated separately. + The program returns a structure that contains the parameters listed + above. + +hmc_parms_t hmc_parms(void) + Returns a structure containing the current values of the parameters + listed above. + +void print_hmc_parms(void) + Prints the lattice parameters to stdout on MPI process 0. + +lat_parms_t set_lat_parms(double beta,double c0, + double kappa_u,double kappa_s,double kappa_c, + double csw,double cG,double cF) + Sets the basic lattice parameters. The parameters are + + beta Inverse bare coupling (beta=6/g_0^2). + + c0 Coefficient of the plaquette loops in the gauge + action (see doc/gauge_action.pdf). + + kappa_{u,s,c} Hopping parameters of the u, s and c sea quarks. The + u and the d quark have the same hopping parameter and + quarks with vanishing hopping parameter are ignored. + + csw Coefficient of the Sheikholeslami-Wohlert term. + + cG,cF Coefficients of the gauge and fermion O(a) boundary + counterterms. + + The return value is a structure that contains the lattice parameters + and the associated bare quark masses m0u, m0s and m0c. + +lat_parms_t lat_parms(void) + Returns the current lattice parameters in a structure that contains + the above parameters plus the bare quark masses. + +void print_lat_parms(void) + Prints the lattice parameters to stdout on MPI process 0. + +bc_parms_t set_bc_parms(int type, + double cG,double cG_prime, + double cF,double cF_prime, + double *phi,double *phi_prime) + Sets the boundary conditions and the associated parameters of the + action. The parameters are + + type Chosen type of boundary condition (0: open, 1: SF, + 2: open-SF, 3: periodic). + + cG,cG_prime Gauge action improvement coefficients at time 0 + and T, respectively. + + cF,cF_prime Fermion action improvement coefficients at time 0 + and T, respectively. + + phi[0], First two angles that define the boundary values of + phi[1] the gauge field at time 0. + + phi_prime[0], First two angles that define the boundary values of + phi_prime[1] the gauge field at time T. + + The return value is a structure that contains these parameters plus + the third angles. In this structure, the improvement coefficients and + the angles are stored in the form of arrays cG[2],cF[2] and phi[2][3], + where cG[0],cF[0],phi[0][3] and cG[1],cF[1],phi[1][3] are the para- + meters at time 0 and T, respectively + Parameters that are not required for the specification of the chosen + boundary conditions are not read and are set to their default values + in the data base (angles to 0, improvement coefficients to 1). In the + case of SF boundary conditions (type 1), the program only reads cG,cF + and the angles phi,phi_prime and then sets cG_prime=cG,cF_prime=cF. + When open-SF boundary conditions are chosen, all parameters except for + the angles phi are read. + +bc_parms_t bc_parms(void) + Returns a structure that contains the boundary parameters. + +void print_bc_parms(void) + Prints the boundary parameters to stdout on MPI process 0. + +void write_bc_parms(FILE *fdat) + Writes the boundary parameters to the file fdat on MPI process 0. + +void check_bc_parms(FILE *fdat) + Compares the currently set boundary parameters with the values stored + on the file fdat on MPI process 0, assuming the latter were written to + the file by the program write_bc_parms(). + +double sea_quark_mass(int im0) + Returns the bare sea quark mass m0u if im0=0, m0s if im0=1 and m0c + if im0=2. In all other cases DBL_MAX is returned. + +sw_parms_t set_sw_parms(double m0) + Sets the parameters of the SW term. The parameter is + + m0 Bare quark mass. + + The return value is a structure that contains the mass m0 and the + improvement coefficients csw and cF, the latter being copied from + the list of the lattice parameters. + +sw_parms_t sw_parms(void) + Returns the parameters currently set for the SW term. The values + of the coefficients csw and cF are copied from the lattice parameter + list. + +tm_parms_t set_tm_parms(int io) + Sets the twisted-mass flag. The parameter is + + io Twisted-mass flag. If io=1, the twisted-mass term + in the Dirac operator, the SAP preconditioner and + the little Dirac operator is turned off on the odd + lattice sites. Otherwise it is applied everywhere. + + The return value is structure that contains the twisted-mass flag. + +tm_parms_t tm_parms(void) + Returns a structure containing the twisted-mass flag. + +mdint_parms_t set_mdint_parms(int ilv,integrator_t integrator,double lambda, + int nstep,int nfr,int *ifr) + Sets the parameters of the molecular-dynamics integrator at level + ilv and returns a structure containing them (see the notes). + +mdint_parms_t mdint_parms(int ilv) + Returns a structure containing the parameters of the integrator at + level ilv (see the notes). + +void read_mdint_parms(int ilv) + On process 0, this program scans stdin for a line starting with the + string "[Level ]" (after any number of blanks), where is + the integer value passed by the argument. An error occurs if no such + line or more than one is found. The lines + + integrator + lambda + nstep + forces [] + + are then read using read_line() [utils/mutils.c]. The line tagged + "lambda" is required only when the specified integrator is the 2nd + order OMF integrator. The line tagged "forces" must contain the + indices of the forces (separated by white space) that are to be + integrated at this level. On exit, the data are entered in the data + base by calling set_mdint_parms(ilv,...). + +void print_mdint_parms(void) + Prints the parameters of the defined integrator levels to stdout + on MPI process 0. + +void write_mdint_parms(FILE *fdat) + Writes the parameters of the defined integrator levels to the file + fdat on MPI process 0. + +void check_mdint_parms(FILE *fdat) + Compares the parameters of the defined integrator levels with those + stored on the file fdat on MPI process 0, assuming the latter were + written to the file by the program write_mdint_parms(). + +rat_parms_t set_rat_parms(int irp,int degree,double *range) + Sets the parameters in the rational function parameter set number + irp and returns a structure containing them (see the notes). + +rat_parms_t rat_parms(int irp) + Returns a structure containing the rational function parameter set + number irp (see the notes). + +void read_rat_parms(int irp) + On process 0, this program scans stdin for a line starting with the + string "[Rational ]" (after any number of blanks), where is + the integer value passed by the argument. An error occurs if no such + line or more than one is found. The lines + + degree + range + + are then read using read_line() [utils/mutils.c] and the data are + entered into the data base by calling set_rat_parms(). + +void print_rat_parms(void) + Prints the defined rational function parameter sets to stdout on MPI + process 0. + +void write_rat_parms(FILE *fdat) + Writes the defined rational function parameter sets to the file fdat + on MPI process 0. + +void check_rat_parms(FILE *fdat) + Compares the defined rational function parameter sets with those + on the file fdat on MPI process 0, assuming the latter were written + to the file by the program write_rat_parms(). + +rw_parms_t set_rw_parms(int irw,rwfact_t rwfact,int im0,int nsrc, + int irp,int nfct,double *mu,int *np,int *isp) + Sets the parameters in the reweighting factor parameter set number + irw and returns a structure containing them (see the notes). + +rw_parms_t rw_parms(int irw) + Returns a structure containing the reweighting factor parameter set + number irw (see the notes). + +void read_rw_parms(int irw) + On process 0, this program scans stdin for a line starting with the + string "[Reweighting factor ]" (after any number of blanks), where + is the integer value passed through the argument. An error occurs + if no such line or more than one is found. The lines + + rwfact + im0 + nsrc + irp + mu [] + np [] + isp [] + + are then read using read_line() [utils/mutils.c] and the data are + added to the data base by calling set_rw_parms(irw,...). Depending + on the value of "rwfact", some lines are not read and can be omitted + in the input file. The number of items on the lines with tag "mu", + "np" and "isp" depends on the reweighting factor too (see the notes). + +void print_rw_parms(void) + Prints the defined reweighting factor parameter sets to stdout on + MPI process 0. + +void write_rw_parms(FILE *fdat) + Writes the defined reweighting factor parameter sets to the file fdat + on MPI process 0. + +void check_rw_parms(FILE *fdat) + Compares the defined reweighting factor parameter sets with those + on the file fdat on MPI process 0, assuming the latter were written + to the file by the program write_rw_parms(). + +sap_parms_t set_sap_parms(int *bs,int isolv,int nmr,int ncy) + Sets the parameters of the SAP preconditioner. The parameters are + + bs[4] Sizes of the blocks in SAP_BLOCKS block grid. + + isolv Block solver to be used (0: plain MinRes, + 1: eo-preconditioned MinRes). + + nmr Number of block solver iterations. + + ncy Number of SAP cycles to be applied. + + The return value is a structure that contains the parameters of the + SAP preconditioners. The block sizes bs[4] can only be set once, but + the values of the other parameters may be changed by calling the + program again. + +sap_parms_t sap_parms(void) + Returns the parameters currently set for the SAP preconditioner. + +void print_sap_parms(int ipr) + Prints the SAP parameters to stdout on MPI process 0. Depending + on whether ipr!=0 or 0, the full information is printed or only + the block size. + +sf_parms_t set_sf_parms(double *phi,double *phi_prime) + Sets the parameters of the boundary fields in the Schroedinger + functional. The parameters are + + phi Angles phi[0],phi[1] at time 0. + + phi_prime Angles phi[0],phi[1] at time T. + + See the notes for further explanations. This program may only be + called once. + +sf_parms_t sf_parms(void) + Returns the parameters of the boundary fields currently set. + +void print_sf_parms(void) + Prints the parameters of the boundary fields to stdout on MPI + process 0. + +void write_sf_parms(FILE *fdat) + Writes the parameters of the boundary fields to the file fdat on + MPI process 0. + +void check_sf_parms(FILE *fdat) + Compares the parameters of the boundary fields with the values + stored on the file fdat on MPI process 0, assuming the latter were + written to the file by the program write_sf_parms(). + +int sf_flg(void) + Returns 1 if the Schroedinger-functional boundary values have been + initialized and 0 otherwise. + +solver_parms_t set_solver_parms(int isp,solver_t solver, + int nkv,int isolv,int nmr,int ncy, + int nmx,double res) + Sets the parameters in the solver parameter set number isp and returns + a structure containing them (see the notes). + +solver_parms_t solver_parms(int isp) + Returns a structure containing the solver parameter set number + isp (see the notes). + +void read_solver_parms(int isp) + On process 0, this program scans stdin for a line starting with the + string "[Solver ]" (after any number of blanks), where is + the integer value passed by the argument. An error occurs if no such + line or more than one is found. The lines + + solver + nkv + isolv + nmr + ncy + nmx + res + + are then read one by one using read_line() [utils/mutils.c]. The + lines with tags nkv,..,ncy may be absent in the case of the CGNE + and MSCG solvers (see the notes). The data are then added to the + data base by calling set_solver_parms(isp,...). + +void print_solver_parms(int *isap,int *idfl) + Prints the parameters of the defined solvers to stdout on MPI + process 0. On exit the flag isap is 1 or 0 depending on whether + one of the solvers makes use of the Schwarz Alternating Procedure + (SAP) or not. Similarly, the flag idfl is set 1 or 0 depending on + whether deflation is used or not. On MPI processes other than 0, + the program does nothing and sets isap and idfl to zero. + +void write_solver_parms(FILE *fdat) + Writes the parameters of the defined solvers to the file fdat on + MPI process 0. + +void check_solver_parms(FILE *fdat) + Compares the parameters of the defined solvers with those stored + on the file fdat on MPI process 0, assuming the latter were written + to the file by the program write_solver_parms(). Mismatches of the + maximal solver iteration number are not considered to be an error. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/README.flags b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/README.flags new file mode 100644 index 0000000000000000000000000000000000000000..d892065070035a696da75c7f9acdf8865cea3ac8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/README.flags @@ -0,0 +1,149 @@ + +******************************************************************************** + + Flags data base explained + +******************************************************************************** + + +Summary +------- + +The tasks carried out in a main program depend on the relevant preparatory +steps being taken in the proper order. The data base that is maintained by the +programs in this module enables the programmer to to check whether the field +arrays are in the proper condition for a specific task using the query_flags() +function. + +Internally this works by assigning a unique tag to every new gauge field +configuration. Other fields that depend on the gauge fields then inherit the +tag when they are calculated. Clearly the data base must be informed about any +steps taken. The function set_flags() does that for the case when the global +fields are concerned. The current lists of events and queries are defined in +the file flags.h and should be self-explanatory. + +In general the philosophy underlying the data base is that the flags reflect +the current contents of the field arrays that are monitored. To ensure the +consistency of the data base, any program that changes the fields must include +a corresponding set_flags() statement. There should be no exception to this +rule. + + +Full-lattice flags +------------------ + +The flags related to the global fields are stored in a structure + +struct +{ + int u,ud; + int udbuf,bstap,fts; + int sw[3],swd[3]; + int aw,awh; +} lat + +with the following elements: + +lat.u Tag of the current single-precision gauge field. + +lat.ud Tag of the current double-precision gauge field. + +lat.udbuf Tag of the double-precision field when its values + at the boundaries of the local lattice were last + copied from the neighbouring MPI processes. + +lat.bstap Tag of the double-precision gauge field when the + boundary staples were last calculated. + +lat.fts Tag of the double-precision gauge field when the + gauge-field tensor was last calculated. + +lat.sw[0] Tag of the gauge field from which the current + single-precision SW-term was calculated. + +lat.sw[1] Indicates whether the single-precision SW-term on + the even sites is inverted (lat.sw[1]=1) or not + (lat.sw[1]=0). + +lat.sw[2] Indicates whether the single-precision SW-term on + the odd sites is inverted (lat.sw[2]=1) or not + (lat.sw[2]=0). + +lat.swd[0] Tag of the gauge field from which the current + double-precision SW-term was calculated. + +lat.swd[1] Indicates whether the double-precision SW-term on + the even sites is inverted (lat.swd[1]=1) or not + (lat.swd[1]=0). + +lat.swd[2] Indicates whether the double-precision SW-term on + the odd sites is inverted (lat.swd[2]=1) or not + (lat.swd[2]=0). + +lat.aw Tag of the double-precision gauge field when the + little Dirac operator was last calculated. + +lat.awh Tag of the double-precision gauge field when the + even-odd preconditioned little Dirac operator was + last calculated. + +Block-grid flags +---------------- + +The data base monitors the fields on the block grids too. Flags are currently +set for two block grids (GCR_BLOCKS and DFL_BLOCKS), but further grids could +easily be incorporated. + +A complication arises from the fact that blocks may share some of the fields. +The data base only keeps track of the fields that are *not* shared. Querying +the status of a shared field is an error recorded by the error_loc() function. + +For each grid, the associated flags are contained in a structure + +typedef struct +{ + int shf; + int u,ud; + int sw[3],swd[3]; +} gf + +with the following elements: + +gf.shf Share flags of the blocks on the block grid. + The bits b1,b2 (counting from the lowest) in + this number are + + b1=1: b.u and bb.u are shared, + b2=1: b.ud and bb.ud are shared. + + All other bits are set to zero. + +gf.u Tag of the single-precision gauge field on the + blocks (=0 if the field is shared). + +gf.ud Tag of the double-precision gauge field on the + blocks (=0 if the field is shared). + +gf.sw[0] Tag of the gauge field at which the current + single-precision SW term on the blocks was + calculated (=0 if the gauge field is shared). + +gf.sw[1] Indicates whether the single-precision SW term + on the even sites of the block is inverted + (gf.sw[1]=1) or not (gf.sw[1]=0). + +gf.sw[2] Indicates whether the single-precision SW term + on the odd sites of the block is inverted + (gf.sw[2]=1) or not (gf.sw[2]=0). + +gf.swd[0] Tag of the gauge field from which the current + double-precision SW term on the block was + calculated (=0 if the gauge field is shared). + +gf.swd[1] Indicates whether the double-precision SW term + on the even sites on the block is inverted + (gf.swd[1]=1) or not (gf.swd[1]=0) + +gf.swd[2] Indicates whether the double-precision SW term + on the odd sites of the block is inverted + (gf.swd[2]=1) or not (gf.swd[2]=0) diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/action_parms.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/action_parms.c new file mode 100644 index 0000000000000000000000000000000000000000..0d815baa385a8823ccd7f888db14f0e124be0eb0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/action_parms.c @@ -0,0 +1,540 @@ + +/******************************************************************************* +* +* File action_parms.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Action parameter data base. +* +* The externally accessible functions are +* +* action_parms_t set_action_parms(int iact,action_t action,int ipf, +* int im0,int *irat,int *imu,int *isp) +* Sets the parameters in the action parameter set number iact and returns +* a structure containing them (see the notes). +* +* action_parms_t action_parms(int iact) +* Returns a structure containing the action parameter set number iact +* (see the notes). +* +* void read_action_parms(int iact) +* On process 0, this program scans stdin for a line starting with the +* string "[Action ]" (after any number of blanks), where is +* the integer value passed by the argument. An error occurs if no such +* line or more than one is found. The lines +* +* action +* ipf +* im0 +* irat +* imu [] +* isp [] +* +* are then read using read_line() [utils/mutils.c]. Depending on the +* value of "action", some lines are not read and can be omitted in +* the input file. The number of integer items on the lines with tag +* "imu" and "isp" depends on the action too. The data are then added +* to the data base by calling set_action_parms(iact,...). +* +* void print_action_parms(void) +* Prints the parameters of the defined actions to stdout on MPI +* process 0. +* +* void write_action_parms(FILE *fdat) +* Writes the parameters of the defined actions to the file fdat on +* MPI process 0. +* +* void check_action_parms(FILE *fdat) +* Compares the parameters of the defined actions with those stored +* on the file fdat on MPI process 0, assuming the latter were written +* to the file by the program write_action_parms(). +* +* Notes: +* +* For a description of the supported actions and their parameters see +* forces/README.forces. +* +* The elements of a structure of type action_parms_t are +* +* action Action program used. This parameter is an enum type with +* one of the following values: +* +* ACG (program action0() [forces/force0.c]), +* +* ACF_TM1 (program action1() [forces/force1.c]), +* +* ACF_TM1_EO (program action4() [forces/force4.c]), +* +* ACF_TM1_EO_SDET (program action4() [forces/force4.c]), +* +* ACF_TM2 (program action2() [forces/force2.c]), +* +* ACF_TM2_EO (program action5() [forces/force5.c]), +* +* ACF_RAT (program action3() [forces/force3.c]), +* +* ACF_RAT_SDET (program action3() [forces/force3.c]), +* +* ipf Pseudo-fermion field index (see mdflds/mdflds.c), +* +* im0 Index of the bare sea quark mass in parameter data base +* (see flags/lat_parms.c), +* +* irat Indices specifying a rational function (see ratfcts/ratfcts.c), +* +* imu Twisted mass indices (see flags/hmc_parms.c), +* +* isp Solver parameter set indices (see flags/solver_parms.c). +* +* Depending on the action, some parameters are not used and are set to zero +* by set_action_parms() independently of the values of the arguments. In +* particular, for a given action, only the required number of integers are +* read from the arrays imu and isp passed to the program. +* +* The number of twisted mass indices and solver parameter set indices is +* 1 and 2 in the case of the actions ACF_TM1* and ACF_TM2*, where isp[k] is +* the solver parameter set used for the solution of the Dirac equation with +* twisted mass index imu[k]. +* +* Up to 32 action parameter sets, labeled by an index iact=0,1,..,31, can +* be specified. Once a set is specified, it cannot be changed by calling +* set_action_parms() again. Action parameters must be globally the same. +* +* Except for action_parms(), the programs in this module perform global +* operations and must be called simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define ACTION_PARMS_C + +#include +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "global.h" + +#define IACMAX 32 + +static int init=0; +static action_t action[]={ACG,ACF_TM1,ACF_TM1_EO,ACF_TM1_EO_SDET, + ACF_TM2,ACF_TM2_EO,ACF_RAT,ACF_RAT_SDET}; +static action_parms_t ap[IACMAX+1]={{ACTIONS,0,0,{0,0,0},{0,0,0,0},{0,0,0,0}}}; + + +static void init_ap(void) +{ + int i; + + for (i=1;i<=IACMAX;i++) + ap[i]=ap[0]; + + init=1; +} + + +action_parms_t set_action_parms(int iact,action_t action, + int ipf,int im0,int *irat,int *imu,int *isp) +{ + int iprms[15],i,ie; + int rat[3],mu[4],sp[4]; + + if (init==0) + init_ap(); + + for (i=0;i<3;i++) + rat[i]=0; + + for (i=0;i<4;i++) + { + mu[i]=0; + sp[i]=0; + } + + if ((action==ACG)||(action==ACTIONS)) + { + ipf=0; + im0=0; + } + else if ((action==ACF_TM1)||(action==ACF_TM1_EO)||(action==ACF_TM1_EO_SDET)) + { + mu[0]=imu[0]; + sp[0]=isp[0]; + } + else if ((action==ACF_TM2)||(action==ACF_TM2_EO)) + { + mu[0]=imu[0]; + mu[1]=imu[1]; + sp[0]=isp[0]; + sp[1]=isp[1]; + } + else if ((action==ACF_RAT)||(action==ACF_RAT_SDET)) + { + rat[0]=irat[0]; + rat[1]=irat[1]; + rat[2]=irat[2]; + sp[0]=isp[0]; + } + + if (NPROC>1) + { + iprms[0]=iact; + iprms[1]=(int)(action); + iprms[2]=ipf; + iprms[3]=im0; + + for (i=0;i<3;i++) + iprms[4+i]=rat[i]; + + for (i=0;i<4;i++) + { + iprms[7+i]=mu[i]; + iprms[11+i]=sp[i]; + } + + MPI_Bcast(iprms,15,MPI_INT,0,MPI_COMM_WORLD); + + ie=0; + ie|=(iprms[0]!=iact); + ie|=(iprms[1]!=(int)(action)); + ie|=(iprms[2]!=ipf); + ie|=(iprms[3]!=im0); + + for (i=0;i<3;i++) + ie|=(iprms[4+i]!=rat[i]); + + for (i=0;i<4;i++) + { + ie|=(iprms[7+i]!=mu[i]); + ie|=(iprms[11+i]!=sp[i]); + } + + error(ie!=0,1,"set_action_parms [action_parms.c]", + "Parameters are not global"); + } + + ie=0; + ie|=((iact<0)||(iact>=IACMAX)); + ie|=(action==ACTIONS); + ie|=((ipf<0)||(im0<0)); + + for (i=0;i<3;i++) + ie|=(rat[i]<0); + + for (i=0;i<4;i++) + ie|=((mu[i]<0)||(sp[i]<0)); + + error_root(ie!=0,1,"set_action_parms [action_parms.c]", + "Parameters are out of range"); + + error_root(ap[iact].action!=ACTIONS,1,"set_action_parms [action_parms.c]", + "Attempt to reset already specified action parameters"); + + ap[iact].action=action; + ap[iact].ipf=ipf; + ap[iact].im0=im0; + + for (i=0;i<3;i++) + ap[iact].irat[i]=rat[i]; + + for (i=0;i<4;i++) + { + ap[iact].imu[i]=mu[i]; + ap[iact].isp[i]=sp[i]; + } + + return ap[iact]; +} + + +action_parms_t action_parms(int iact) +{ + if (init==0) + init_ap(); + + if ((iact>=0)&&(iact1) + { + MPI_Bcast(&ida,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ipf,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&im0,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(irat,3,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(imu,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(isp,4,MPI_INT,0,MPI_COMM_WORLD); + } + + set_action_parms(iact,action[ida],ipf,im0,irat,imu,isp); +} + + +void print_action_parms(void) +{ + int my_rank,i; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if ((my_rank==0)&&(init==1)) + { + for (i=0;i=4). +* +* nmr Number of block minimal residual iterations to be +* used when the SAP smoother is applied. +* +* ncy Number of SAP cycles per inverse iteration. +* +* The return value is a structure that contains the above parameters and +* the bare mass m0 that corresponds to the hopping parameter kappa. +* +* dfl_gen_parms_t dfl_gen_parms(void) +* Returns the parameters currently set for the generation of the deflation +* subspace plus the corresponding bare mass m0. +* +* dfl_upd_parms_t set_dfl_upd_parms(double dtau,int nsm) +* Sets the parameters of the deflation subspace update scheme. The +* parameters are +* +* dtau Molecular-dynamics time separation between +* updates of the deflation subspace. +* +* nsm Number of deflated smoothing interations to be +* applied when the subspace is updated. +* +* The return value is a structure that contains the above parameters. +* +* dfl_upd_parms_t dfl_upd_parms(void) +* Returns the parameters currently set for the deflation subspace +* update scheme. +* +* void print_dfl_parms(int ipr) +* Prints the parameters of the deflation subspace, the projectors, the +* subspace generation algorithm and the update scheme to stdout on MPI +* process 0. The update scheme is omitted if ipr=0. +* +* void write_dfl_parms(FILE *fdat) +* Writes the parameters of the deflation subspace, the projectors, the +* subspace generation algorithm and the update scheme to the file fdat +* on MPI process 0. +* +* void check_dfl_parms(FILE *fdat) +* Compares the parameters of the deflation subspace, the projectors the +* subspace generation algorithm and the update scheme with the values +* stored on the file fdat on MPI process 0, assuming the latter were +* written to the file by the program write_dfl_parms() (mismatches of +* maximal solver iteration numbers are not considered to be an error). +* +* Notes: +* +* To ensure the consistency of the data base, the parameters must be set +* simultaneously on all processes. The types dfl_parms_t, ... are defined +* in the file flags.h. +* +*******************************************************************************/ + +#define DFL_PARMS_C + +#include +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "global.h" + +static dfl_parms_t dfl={{0,0,0,0},0}; +static dfl_pro_parms_t dfl_pro={0,0,1.0}; +static dfl_gen_parms_t dfl_gen={0,0,0,0.0,DBL_MAX,0.0}; +static dfl_upd_parms_t dfl_upd={0,0.0}; + + +static void check_block_size(int *bs) +{ + int n0,n1,n2,n3; + + error_root((bs[0]<4)||(bs[1]<4)||(bs[2]<4)||(bs[3]<4)|| + (bs[0]>L0)||(bs[1]>L1)||(bs[2]>L2)||(bs[3]>L3),1, + "check_block_size [dfl_parms.c]", + "Block sizes are out of range"); + + error_root((bs[0]%2)||(bs[1]%2)||(bs[2]%2)||(bs[3]%2),1, + "check_block_size [dfl_parms.c]", + "Block sizes must be even"); + + error_root((L0%bs[0])||(L1%bs[1])||(L2%bs[2])||(L3%bs[3]),1, + "check_block_size [dfl_parms.c]", + "Blocks do not divide the local lattice"); + + n0=L0/bs[0]; + n1=L1/bs[1]; + n2=L2/bs[2]; + n3=L3/bs[3]; + + error_root(((NPROC0*n0)%2)||((NPROC1*n1)%2)|| + ((NPROC2*n2)%2)||((NPROC3*n3)%2),1, + "check_block_size [dfl_parms.c]", + "There must be an even number of blocks in each direction"); + + error_root((n0*n1*n2*n3)%2,1, + "check_block_size [dfl_parms.c]", + "The number of blocks in the local lattice must be even"); +} + + +dfl_parms_t set_dfl_parms(int *bs,int Ns) +{ + int iprms[5]; + + if (NPROC>1) + { + iprms[0]=bs[0]; + iprms[1]=bs[1]; + iprms[2]=bs[2]; + iprms[3]=bs[3]; + iprms[4]=Ns; + + MPI_Bcast(iprms,5,MPI_INT,0,MPI_COMM_WORLD); + + error((iprms[0]!=bs[0])||(iprms[1]!=bs[1])||(iprms[2]!=bs[2])|| + (iprms[3]!=bs[3])||(iprms[4]!=Ns),1, + "set_dfl_parms [dfl_parms.c]","Parameters are not global"); + } + + error_root((dfl.Ns>0)&&((bs[0]!=dfl.bs[0])||(bs[1]!=dfl.bs[1])|| + (bs[2]!=dfl.bs[2])||(bs[3]!=dfl.bs[3])|| + (Ns!=dfl.Ns)),1, + "set_dfl_parms [dfl_parms.c]","bs[4] and Ns may be set only once"); + + check_block_size(bs); + error_root((Ns<2)||(Ns&0x1),1,"set_dfl_parms [dfl_parms.c]", + "Improper value of Ns"); + + dfl.bs[0]=bs[0]; + dfl.bs[1]=bs[1]; + dfl.bs[2]=bs[2]; + dfl.bs[3]=bs[3]; + dfl.Ns=Ns; + + return dfl; +} + + +dfl_parms_t dfl_parms(void) +{ + return dfl; +} + + +dfl_pro_parms_t set_dfl_pro_parms(int nkv,int nmx,double res) +{ + int iprms[2]; + double dprms[1]; + + if (NPROC>1) + { + iprms[0]=nkv; + iprms[1]=nmx; + + dprms[0]=res; + + MPI_Bcast(iprms,2,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error((iprms[0]!=nkv)||(iprms[1]!=nmx)||(dprms[0]!=res),1, + "set_dfl_pro_parms [dfl_parms.c]","Parameters are not global"); + } + + error_root((nkv<1)||(nmx<1)||(res<=DBL_EPSILON),1, + "set_dfl_pro_parms [dfl_parms.c]","Improper parameter values"); + + dfl_pro.nkv=nkv; + dfl_pro.nmx=nmx; + dfl_pro.res=res; + + return dfl_pro; +} + + +dfl_pro_parms_t dfl_pro_parms(void) +{ + return dfl_pro; +} + + +dfl_gen_parms_t set_dfl_gen_parms(double kappa,double mu, + int ninv,int nmr,int ncy) +{ + int iprms[3]; + double dprms[2]; + + if (NPROC>1) + { + iprms[0]=ninv; + iprms[1]=nmr; + iprms[2]=ncy; + + dprms[0]=kappa; + dprms[1]=mu; + + MPI_Bcast(iprms,3,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error((iprms[0]!=ninv)||(iprms[1]!=nmr)||(iprms[2]!=ncy)|| + (dprms[0]!=kappa)||(dprms[1]!=mu),1, + "set_dfl_gen_parms [dfl_parms.c]","Parameters are not global"); + } + + error_root((ninv<4)||(nmr<1)||(ncy<1)||(kappa<0.0),1, + "set_dfl_gen_parms [dfl_parms.c]","Parameters are out of range"); + + dfl_gen.ninv=ninv; + dfl_gen.nmr=nmr; + dfl_gen.ncy=ncy; + + dfl_gen.kappa=kappa; + dfl_gen.mu=mu; + + if (kappa!=0.0) + dfl_gen.m0=1.0/(2.0*kappa)-4.0; + else + dfl_gen.m0=DBL_MAX; + + return dfl_gen; +} + + +dfl_gen_parms_t dfl_gen_parms(void) +{ + return dfl_gen; +} + + +dfl_upd_parms_t set_dfl_upd_parms(double dtau,int nsm) +{ + int iprms[1]; + double dprms[1]; + + if (NPROC>1) + { + iprms[0]=nsm; + dprms[0]=dtau; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error((iprms[0]!=nsm)||(dprms[0]!=dtau),1, + "set_dfl_upd_parms [dfl_parms.c]","Parameters are not global"); + } + + error_root((dtau<0.0)||(nsm<0),1, + "set_dfl_upd_parms [dfl_parms.c]","Improper parameter values"); + + dfl_upd.dtau=dtau; + dfl_upd.nsm=nsm; + + return dfl_upd; +} + + +dfl_upd_parms_t dfl_upd_parms(void) +{ + return dfl_upd; +} + + +void print_dfl_parms(int ipr) +{ + int my_rank,n; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + printf("Deflation subspace:\n"); + printf("bs = %d %d %d %d\n",dfl.bs[0],dfl.bs[1],dfl.bs[2],dfl.bs[3]); + printf("Ns = %d\n\n",dfl.Ns); + + printf("Deflation projection:\n"); + printf("nkv = %d\n",dfl_pro.nkv); + printf("nmx = %d\n",dfl_pro.nmx); + printf("res = %.1e\n\n",dfl_pro.res); + + printf("Deflation subspace generation:\n"); + n=fdigits(dfl_gen.kappa); + printf("kappa = %.*f\n",IMAX(n,6),dfl_gen.kappa); + n=fdigits(dfl_gen.mu); + printf("mu = %.*f\n",IMAX(n,1),dfl_gen.mu); + printf("ninv = %d\n",dfl_gen.ninv); + printf("nmr = %d\n",dfl_gen.nmr); + printf("ncy = %d\n\n",dfl_gen.ncy); + + if (ipr) + { + printf("Deflation subspace update scheme:\n"); + n=fdigits(dfl_upd.dtau); + printf("dtau = %.*f\n",IMAX(n,1),dfl_upd.dtau); + printf("nsm = %d\n\n",dfl_upd.nsm); + } + } +} + + +void write_dfl_parms(FILE *fdat) +{ + int my_rank,endian; + int i,iw; + stdint_t istd[11]; + double dstd[4]; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + endian=endianness(); + + if (my_rank==0) + { + for (i=0;i<4;i++) + istd[i]=(stdint_t)(dfl.bs[i]); + + istd[4]=(stdint_t)(dfl.Ns); + istd[5]=(stdint_t)(dfl_pro.nkv); + istd[6]=(stdint_t)(dfl_pro.nmx); + istd[7]=(stdint_t)(dfl_gen.ninv); + istd[8]=(stdint_t)(dfl_gen.nmr); + istd[9]=(stdint_t)(dfl_gen.ncy); + istd[10]=(stdint_t)(dfl_upd.nsm); + + dstd[0]=dfl_pro.res; + dstd[1]=dfl_gen.kappa; + dstd[2]=dfl_gen.mu; + dstd[3]=dfl_upd.dtau; + + if (endian==BIG_ENDIAN) + { + bswap_int(11,istd); + bswap_double(4,dstd); + } + + iw=fwrite(istd,sizeof(stdint_t),11,fdat); + iw+=fwrite(dstd,sizeof(double),4,fdat); + error_root(iw!=15,1,"write_dfl_parms [dfl_parms.c]", + "Incorrect write count"); + } +} + + +void check_dfl_parms(FILE *fdat) +{ + int my_rank,endian; + int i,ir,ie; + stdint_t istd[11]; + double dstd[4]; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + endian=endianness(); + + if (my_rank==0) + { + ir=fread(istd,sizeof(stdint_t),11,fdat); + ir+=fread(dstd,sizeof(double),4,fdat); + error_root(ir!=15,1,"check_dfl_parms [dfl_parms.c]", + "Incorrect read count"); + + if (endian==BIG_ENDIAN) + { + bswap_int(11,istd); + bswap_double(4,dstd); + } + + ie=0; + + for (i=0;i<4;i++) + ie|=(istd[i]!=(stdint_t)(dfl.bs[i])); + + ie|=(istd[4]!=(stdint_t)(dfl.Ns)); + ie|=(istd[5]!=(stdint_t)(dfl_pro.nkv)); + ie|=(istd[7]!=(stdint_t)(dfl_gen.ninv)); + ie|=(istd[8]!=(stdint_t)(dfl_gen.nmr)); + ie|=(istd[9]!=(stdint_t)(dfl_gen.ncy)); + ie|=(istd[10]!=(stdint_t)(dfl_upd.nsm)); + + ie|=(dstd[0]!=dfl_pro.res); + ie|=(dstd[1]!=dfl_gen.kappa); + ie|=(dstd[2]!=dfl_gen.mu); + ie|=(dstd[3]!=dfl_upd.dtau); + + error_root(ie!=0,1,"check_dfl_parms [dfl_parms.c]", + "Parameters do not match"); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/flags.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/flags.c new file mode 100644 index 0000000000000000000000000000000000000000..94a49035c6af827a1a23cfd89203114033559330 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/flags.c @@ -0,0 +1,391 @@ + +/******************************************************************************* +* +* File flags.c +* +* Copyright (C) 2009, 2011, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Flags data base input and query programs +* +* The externally accessible functions are +* +* void set_flags(event_t event) +* Reports an event to the data base, where some of the global field +* arrays are changed. +* +* void set_grid_flags(blk_grid_t grid,event_t event) +* Reports an event to the data base, where some of the field arrays +* on the specified block grid are changed. +* +* int query_flags(query_t query) +* Queries the data base on the status of the global field arrays. +* The program returns 1 or 0 depending on whether the answer to the +* specified query is "yes" or "no". If the query is unknown to the +* the data base, the program returns -1. +* +* int query_grid_flags(blk_grid_t grid,query_t query) +* Queries the data base on the status of the field arrays on the +* specified block grid. The program returns 1 or 0 depending on +* whether the answer to the specified query is "yes" or "no". If +* the query is unknown to the data base, the program returns -1. +* +* void print_flags(void) +* Prints the current values of all flags describing the state of +* the global field arrays to stdout on process 0. +* +* void print_grid_flags(blk_grid_t grid) +* Prints the current values of all flags describing the state of +* the field arrays on the specified block grid to stdout on +* process 0. +* +* Notes: +* +* The programs set_flags() and set_grid_flags() perform global operations +* and must be called on all processes simultaneously. As a consequence, +* the contents of the data base is the same everywhere. All other programs +* in this module can be called locally. +* +* The possible events and queries are defined in the header file flags.h. +* The associated actions are defined in the *.h files in the include/flags +* directory (application programs do not need to include these). +* +* For further explanations, see the file README.flags in this directory. +* +*******************************************************************************/ + +#define FLAGS_C + +#include +#include +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "global.h" + +#define NFLGS (9+4*(int)(BLK_GRIDS)) + +static struct +{ + int u,ud,udbuf; + int bstap,fts; + int sw[3],swd[3]; + int aw,awh; +} lat={0,0,0,0,0,{0,0,0},{0,0,0},0,0}; + +typedef struct +{ + int shf; + int u,ud; + int sw[3],swd[3]; +} grid_flags_t; + +static int init=0,tag=0; +static int flgs[NFLGS]; +static grid_flags_t gfs[(int)(BLK_GRIDS)+1]={{0x0,0,0,{0,0,0},{0,0,0}}},*gf; + + +static void set_flgs(void) +{ + int n,igr; + + flgs[0]=lat.u; + flgs[1]=lat.ud; + flgs[2]=lat.udbuf; + flgs[3]=lat.bstap; + flgs[4]=lat.fts; + flgs[5]=lat.sw[0]; + flgs[6]=lat.swd[0]; + flgs[7]=lat.aw; + flgs[8]=lat.awh; + + n=9; + + for (igr=0;igr<(int)(BLK_GRIDS);igr++) + { + flgs[n++]=gfs[igr].u; + flgs[n++]=gfs[igr].ud; + flgs[n++]=gfs[igr].sw[0]; + flgs[n++]=gfs[igr].swd[0]; + } +} + + +static void find_gap(int *a,int *d) +{ + int k,l; + int fk,h,hmax; + + (*a)=0; + (*d)=INT_MAX; + + for (k=0;k0)&&(h<(*d))) + (*d)=h; + } + + for (k=0;k0)&&(h(*d)) + { + (*a)=fk; + (*d)=hmax; + } + } +} + + +static void compress_flags(void) +{ + int k,a,d; + int n,igr; + + set_flgs(); + find_gap(&a,&d); + d-=1; + + for (k=0;ka) + flgs[k]-=d; + } + + lat.u=flgs[0]; + lat.ud=flgs[1]; + lat.udbuf=flgs[2]; + lat.bstap=flgs[3]; + lat.fts=flgs[4]; + lat.sw[0]=flgs[5]; + lat.swd[0]=flgs[6]; + lat.aw=flgs[7]; + lat.awh=flgs[8]; + + n=9; + + for (igr=0;igr<(int)(BLK_GRIDS);igr++) + { + gfs[igr].u=flgs[n++]; + gfs[igr].ud=flgs[n++]; + gfs[igr].sw[0]=flgs[n++]; + gfs[igr].swd[0]=flgs[n++]; + } + + tag-=d; +} + + +static int next_tag(void) +{ + if (tag==INT_MAX) + compress_flags(); + tag+=1; + + return tag; +} + +#include "flags/events.h" +#include "flags/grid_events.h" +#include "flags/queries.h" +#include "flags/grid_queries.h" + +static void set_arrays(void) +{ + int igr; + + for (igr=1;igr<=(int)(BLK_GRIDS);igr++) + gfs[igr]=gfs[0]; + + gfs[(int)(SAP_BLOCKS)].shf=0x0; + gfs[(int)(DFL_BLOCKS)].shf=0x2; + + set_events(); + set_grid_events(); + set_queries(); + set_grid_queries(); + + init=1; +} + + +void set_flags(event_t event) +{ + int iprms[1],iev; + + if (init==0) + set_arrays(); + + iev=(int)(event); + + if (NPROC>1) + { + iprms[0]=iev; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + + error(iprms[0]!=iev,1,"set_flags [flags.c]", + "Parameter is not global"); + } + + if (event_fcts[iev]==NULL) + error_root(1,1,"set_flags [flags.c]","No action associated to event"); + else + event_fcts[iev](); +} + + +void set_grid_flags(blk_grid_t grid,event_t event) +{ + int iprms[2],igr,iev; + + if (init==0) + set_arrays(); + + igr=(int)(grid); + iev=(int)(event); + + if (NPROC>1) + { + iprms[0]=igr; + iprms[1]=iev; + + MPI_Bcast(iprms,2,MPI_INT,0,MPI_COMM_WORLD); + + error((iprms[0]!=igr)||(iprms[1]!=iev),1, + "set_grid_flags [flags.c]","Parameters are not global"); + } + + if (grid==BLK_GRIDS) + error_root(1,1,"set_grid_flags [flags.c]", + "BLK_GRIDS is a dummy block grid"); + + if (grid_event_fcts[iev]==NULL) + error_root(1,1,"set_grid_flags [flags.c]", + "No action associated to event"); + else + { + gf=gfs+igr; + grid_event_fcts[iev](); + } +} + + +int query_flags(query_t query) +{ + int iqr; + + if (init==0) + set_arrays(); + + iqr=(int)(query); + + if (query_fcts[iqr]==NULL) + { + error_loc(1,1,"query_flags [flags.c]","No response to query"); + return -1; + } + else + return query_fcts[iqr](); +} + + +int query_grid_flags(blk_grid_t grid,query_t query) +{ + int iqr; + + if (init==0) + set_arrays(); + + iqr=(int)(query); + + if (grid_query_fcts[iqr]==NULL) + { + error_loc(1,1,"query_grid_flags [flags.c]","No response to query"); + return -1; + } + else + { + gf=gfs+(int)(grid); + return grid_query_fcts[iqr](); + } +} + + +void print_flags(void) +{ + int my_rank; + + if (init==0) + set_arrays(); + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + printf("Full lattice flags:\n"); + printf("u = %d\n",lat.u); + printf("ud,udbuf = %d,%d\n",lat.ud,lat.udbuf); + printf("bstap,fts = %d,%d\n",lat.bstap,lat.fts); + printf("sw = %d,%d,%d\n", + lat.sw[0],lat.sw[1],lat.sw[2]); + printf("swd = %d,%d,%d\n", + lat.swd[0],lat.swd[1],lat.swd[2]); + printf("aw,awh = %d,%d\n",lat.aw,lat.awh); + printf("\n"); + } +} + + +void print_grid_flags(blk_grid_t grid) +{ + int my_rank; + + if (init==0) + set_arrays(); + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + gf=gfs+(int)(grid); + + if (grid==SAP_BLOCKS) + printf("Flags on the SAP block grid:\n"); + else if (grid==DFL_BLOCKS) + printf("Flags on the DFL block grid:\n"); + else + error_root(1,1,"print_grid_flags [flags.c]","Unknown block grid"); + + printf("shf = %#x\n",(*gf).shf); + printf("u = %d\n",(*gf).u); + printf("ud = %d\n",(*gf).ud); + printf("sw = %d,%d,%d\n", + (*gf).sw[0],(*gf).sw[1],(*gf).sw[2]); + printf("swd = %d,%d,%d\n", + (*gf).swd[0],(*gf).swd[1],(*gf).swd[2]); + printf("\n"); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/force_parms.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/force_parms.c new file mode 100644 index 0000000000000000000000000000000000000000..52cb72689dd4799ed148c022f210ce03b5d3d4c6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/force_parms.c @@ -0,0 +1,799 @@ + +/******************************************************************************* +* +* File force_parms.c +* +* Copyright (C) 2011, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Force parameter data base +* +* The externally accessible functions are +* +* force_parms_t set_force_parms(int ifr,force_t force,int ipf,int im0, +* int *irat,int *imu,int *isp,int *ncr) +* Sets the parameters in the force parameter set number ifr and returns +* a structure containing them (see the notes). +* +* force_parms_t force_parms(int ifr) +* Returns a structure containing the force parameter set number ifr +* (see the notes). +* +* void read_force_parms(int ifr) +* On process 0, this program scans stdin for a line starting with the +* string "[Force ]" (after any number of blanks), where is +* the integer value passed by the argument. An error occurs if no such +* line or more than one is found. The lines +* +* force +* ipf +* im0 +* irat +* imu [] +* isp [] +* ncr [] +* +* are then read using read_line() [utils/mutils.c]. Depending on the +* value of "force", some lines are not read and can be omitted in the +* input file. The number of integer items on the lines with tag "imu" +* and "isp" and "ncr" depends on the force too. The data are then added +* to the data base by calling set_force_parms(ifr,...). +* +* void read_force_parms2(int ifr) +* Same as read_force_parms() except that only the lines +* +* force +* isp [] +* ncr [] +* +* are read from stdin. All other force parameters are inferred from +* the parameters of the action no ifr so that the force is the one +* deriving from that action. An error occurs if the parameters of the +* action no ifr have not previously been added to the data base or +* if the force and action types do not match. +* +* void print_force_parms(void) +* Prints the parameters of the defined forces to stdout on MPI +* process 0. +* +* void print_force_parms2(void) +* Prints the parameters of the defined forces to stdout on MPI +* process 0 in a short format corresponding to read_force_parms2(). +* +* void write_force_parms(FILE *fdat) +* Writes the parameters of the defined forces to the file fdat on +* MPI process 0. +* +* void check_force_parms(FILE *fdat) +* Compares the parameters of the defined forces with those stored +* on the file fdat on MPI process 0, assuming the latter were written +* to the file by the program write_force_parms(). +* +* Notes: +* +* For a description of the supported forces and their parameters see +* forces/README.forces. +* +* The elements of a structure of type force_parms_t are +* +* force Force program used. This parameter is an enum type with +* one of the following values: +* +* FRG (program force0() [forces/force0.c]), +* +* FRF_TM1 (program force1() [forces/force1.c]), +* +* FRF_TM1_EO (program force4() [forces/force4.c]), +* +* FRF_TM1_EO_SDET (program force4() [forces/force4.c]), +* +* FRF_TM2 (program force2() [forces/force2.c]), +* +* FRF_TM2_EO (program force5() [forces/force5.c]), +* +* FRF_RAT (program force3() [forces/force3.c]), +* +* FRF_RAT_SDET (program force3() [forces/force3.c]), +* +* ipf Pseudo-fermion field index (see mdflds/mdflds.c), +* +* im0 Index of the bare sea quark mass in parameter data base +* (see flags/lat_parms.c), +* +* irat Indices specifying a rational function (see ratfcts/ratfcts.c), +* +* imu Twisted mass indices (see flags/hmc_parms.c), +* +* isp Solver parameter set indices (see flags/solver_parms.c), +* +* ncr Chronological solver stack sizes (see update/chrono.c), +* +* icr Chronological solver stack indices (set internally). +* +* Depending on the force, some parameters are not used and are set to zero +* by set_force_parms() independently of the values of the arguments. In +* particular, for a given force, only the required number of integers are +* read from the arrays imu, isp and ncr passed to the program. +* +* The number of twisted mass indices is 1 and 2 in the case of the forces +* FRF_TM1* and FRF_TM2*, respectively. These forces require a chronological +* solver stack size to be specified and 1 solver parameter set to be used +* for the solution of the Dirac equation with twisted mass index imu[0]. +* +* Up to 32 force parameter sets, labeled by an index ifr=0,1,..,31, can +* be specified. Once a set is specified, it cannot be changed by calling +* set_force_parms() again. Force parameters must be globally the same. +* +* Except for force_parms(), the programs in this module perform global +* operations and must be called simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define FORCE_PARMS_C + +#include +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "global.h" + +#define IFRMAX 32 + +static int init=0,icr=0; +static force_t force[]={FRG,FRF_TM1,FRF_TM1_EO,FRF_TM1_EO_SDET, + FRF_TM2,FRF_TM2_EO,FRF_RAT,FRF_RAT_SDET}; +static force_parms_t fp[IFRMAX+1]={{FORCES,0,0,{0,0,0},{0,0,0,0},{0,0,0,0}, + {0,0,0,0},{0,0,0,0}}}; + + +static void init_fp(void) +{ + int i; + + for (i=1;i<=IFRMAX;i++) + fp[i]=fp[0]; + + init=1; +} + + +force_parms_t set_force_parms(int ifr,force_t force,int ipf,int im0, + int *irat,int *imu,int *isp,int *ncr) +{ + int iprms[23],i,ie; + int rat[3],mu[4],sp[4],nc[4],ic[4]; + + if (init==0) + init_fp(); + + for (i=0;i<3;i++) + rat[i]=0; + + for (i=0;i<4;i++) + { + mu[i]=0; + sp[i]=0; + nc[i]=0; + ic[i]=0; + } + + if ((force==FRG)||(force==FORCES)) + { + ipf=0; + im0=0; + } + else if ((force==FRF_TM1)||(force==FRF_TM1_EO)||(force==FRF_TM1_EO_SDET)) + { + mu[0]=imu[0]; + sp[0]=isp[0]; + + if (ncr[0]>0) + { + icr+=1; + nc[0]=ncr[0]; + ic[0]=icr; + } + } + else if ((force==FRF_TM2)||(force==FRF_TM2_EO)) + { + mu[0]=imu[0]; + mu[1]=imu[1]; + sp[0]=isp[0]; + + if (ncr[0]>0) + { + icr+=1; + nc[0]=ncr[0]; + ic[0]=icr; + } + } + else if ((force==FRF_RAT)||(force==FRF_RAT_SDET)) + { + rat[0]=irat[0]; + rat[1]=irat[1]; + rat[2]=irat[2]; + sp[0]=isp[0]; + } + + if (NPROC>1) + { + iprms[0]=ifr; + iprms[1]=(int)(force); + iprms[2]=ipf; + iprms[3]=im0; + + for (i=0;i<3;i++) + iprms[4+i]=rat[i]; + + for (i=0;i<4;i++) + { + iprms[7+i]=mu[i]; + iprms[11+i]=sp[i]; + iprms[15+i]=nc[i]; + iprms[19+i]=ic[i]; + } + + MPI_Bcast(iprms,23,MPI_INT,0,MPI_COMM_WORLD); + + ie=0; + ie|=(iprms[0]!=ifr); + ie|=(iprms[1]!=(int)(force)); + ie|=(iprms[2]!=ipf); + ie|=(iprms[3]!=im0); + + for (i=0;i<3;i++) + ie|=(iprms[4+i]!=rat[i]); + + for (i=0;i<4;i++) + { + ie|=(iprms[7+i]!=mu[i]); + ie|=(iprms[11+i]!=sp[i]); + ie|=(iprms[15+i]!=nc[i]); + ie|=(iprms[19+i]!=ic[i]); + } + + error(ie!=0,1,"set_force_parms [force_parms.c]", + "Parameters are not global"); + } + + ie=0; + ie|=((ifr<0)||(ifr>=IFRMAX)); + ie|=(force==FORCES); + ie|=((ipf<0)||(im0<0)); + + for (i=0;i<3;i++) + ie|=(rat[i]<0); + + for (i=0;i<4;i++) + { + ie|=(mu[i]<0); + ie|=(sp[i]<0); + ie|=(nc[i]<0); + } + + error_root(ie!=0,1,"set_force_parms [force_parms.c]", + "Parameters are out of range"); + + error_root(fp[ifr].force!=FORCES,1,"set_force_parms [force_parms.c]", + "Attempt to reset already specified force parameters"); + + fp[ifr].force=force; + fp[ifr].ipf=ipf; + fp[ifr].im0=im0; + + for (i=0;i<3;i++) + fp[ifr].irat[i]=rat[i]; + + for (i=0;i<4;i++) + { + fp[ifr].imu[i]=mu[i]; + fp[ifr].isp[i]=sp[i]; + fp[ifr].ncr[i]=nc[i]; + fp[ifr].icr[i]=ic[i]; + } + + return fp[ifr]; +} + + +force_parms_t force_parms(int ifr) +{ + if (init==0) + init_fp(); + + if ((ifr>=0)&&(ifr1) + { + MPI_Bcast(&idf,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ipf,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&im0,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(irat,3,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(imu,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(isp,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(ncr,4,MPI_INT,0,MPI_COMM_WORLD); + } + + set_force_parms(ifr,force[idf],ipf,im0,irat,imu,isp,ncr); +} + + +void read_force_parms2(int ifr) +{ + int my_rank,i,ie,idf; + int ipf,im0,irat[3],imu[4],isp[4],ncr[4]; + char line[NAME_SIZE]; + action_parms_t ap; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + ie=0; + idf=0; + ipf=0; + im0=0; + + for (i=0;i<3;i++) + irat[i]=0; + + for (i=0;i<4;i++) + { + imu[i]=0; + isp[i]=0; + ncr[i]=0; + } + + if (my_rank==0) + { + ap=action_parms(ifr); + error_root(ap.action==ACTIONS,1,"read_force_parms2 [force_parms.c]", + "Undefined action"); + + sprintf(line,"Force %d",ifr); + find_section(line); + read_line("force","%s",line); + + if (ap.action==ACG) + ie=strcmp(line,"FRG"); + else if (ap.action==ACF_TM1) + { + ie=strcmp(line,"FRF_TM1"); + idf=1; + ipf=ap.ipf; + im0=ap.im0; + imu[0]=ap.imu[0]; + read_line("isp","%d",isp); + read_line("ncr","%d",ncr); + } + else if (ap.action==ACF_TM1_EO) + { + ie=strcmp(line,"FRF_TM1_EO"); + idf=2; + ipf=ap.ipf; + im0=ap.im0; + imu[0]=ap.imu[0]; + read_line("isp","%d",isp); + read_line("ncr","%d",ncr); + } + else if (ap.action==ACF_TM1_EO_SDET) + { + ie=strcmp(line,"FRF_TM1_EO_SDET"); + idf=3; + ipf=ap.ipf; + im0=ap.im0; + imu[0]=ap.imu[0]; + read_line("isp","%d",isp); + read_line("ncr","%d",ncr); + } + else if (ap.action==ACF_TM2) + { + ie=strcmp(line,"FRF_TM2"); + idf=4; + ipf=ap.ipf; + im0=ap.im0; + imu[0]=ap.imu[0]; + imu[1]=ap.imu[1]; + read_line("isp","%d",isp); + read_line("ncr","%d",ncr); + } + else if (ap.action==ACF_TM2_EO) + { + ie=strcmp(line,"FRF_TM2_EO"); + idf=5; + ipf=ap.ipf; + im0=ap.im0; + imu[0]=ap.imu[0]; + imu[1]=ap.imu[1]; + read_line("isp","%d",isp); + read_line("ncr","%d",ncr); + } + else if (ap.action==ACF_RAT) + { + ie=strcmp(line,"FRF_RAT"); + idf=6; + ipf=ap.ipf; + im0=ap.im0; + irat[0]=ap.irat[0]; + irat[1]=ap.irat[1]; + irat[2]=ap.irat[2]; + read_line("isp","%d",isp); + } + else if (ap.action==ACF_RAT_SDET) + { + ie=strcmp(line,"FRF_RAT_SDET"); + idf=7; + ipf=ap.ipf; + im0=ap.im0; + irat[0]=ap.irat[0]; + irat[1]=ap.irat[1]; + irat[2]=ap.irat[2]; + read_line("isp","%d",isp); + } + else + error_root(1,1,"read_force_parms2 [force_parms.c]", + "Unknown action"); + + error_root(ie!=0,1,"read_force_parms2 [force_parms.c]", + "Force and action types do not match"); + } + + if (NPROC>1) + { + MPI_Bcast(&idf,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ipf,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&im0,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(irat,3,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(imu,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(isp,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(ncr,4,MPI_INT,0,MPI_COMM_WORLD); + } + + set_force_parms(ifr,force[idf],ipf,im0,irat,imu,isp,ncr); +} + + +void print_force_parms(void) +{ + int my_rank,i; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if ((my_rank==0)&&(init==1)) + { + for (i=0;i +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "global.h" + +static hmc_parms_t hmc={0,0,0,0,NULL,0.0,NULL}; + + +hmc_parms_t set_hmc_parms(int nact,int *iact,int npf,int nmu, + double *mu,int nlv,double tau) +{ + int iprms[4],i,ie; + double dprms[1]; + + if (NPROC>1) + { + iprms[0]=nact; + iprms[1]=npf; + iprms[2]=nmu; + iprms[3]=nlv; + dprms[0]=tau; + + MPI_Bcast(iprms,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error((iprms[0]!=nact)||(iprms[1]!=npf)||(iprms[2]!=nmu)|| + (iprms[3]!=nlv)||(dprms[0]!=tau),1, + "set_hmc_parms [hmc_parms.c]","Parameters are not global"); + + ie=0; + + for (i=0;i0)&&(npf!=hmc.npf),1,"set_hmc_parms [hmc_parms.c]", + "Number of pseudo-fermion fields may be set only once"); + + if (nact!=hmc.nact) + { + if (hmc.iact!=NULL) + { + free(hmc.iact); + hmc.iact=NULL; + } + + if (nact>0) + { + hmc.iact=malloc(nact*sizeof(int)); + error(hmc.iact==NULL,1,"set_hmc_parms [hmc_parms.c]", + "Unable to allocate parameter array"); + } + } + + if (nmu!=hmc.nmu) + { + if (hmc.mu!=NULL) + { + free(hmc.mu); + hmc.mu=NULL; + } + + if (nmu>0) + { + hmc.mu=malloc(nmu*sizeof(double)); + error(hmc.mu==NULL,2,"set_hmc_parms [hmc_parms.c]", + "Unable to allocate parameter array"); + } + } + + hmc.nact=nact; + hmc.npf=npf; + hmc.nmu=nmu; + hmc.nlv=nlv; + hmc.tau=tau; + + for (i=0;i0) + { + printf("mu ="); + for (i=0;i +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int flg_lat=0,flg_bc=0; +static lat_parms_t lat={0,0.0,1.0,0.0,NULL,NULL,1.0}; +static bc_parms_t bc={0,{1.0,1.0},{1.0,1.0},{{0.0,0.0,0.0},{0.0,0.0,0.0}}}; +static sw_parms_t sw={DBL_MAX,1.0,{1.0,1.0}}; +static tm_parms_t tm={0}; + + +lat_parms_t set_lat_parms(double beta,double c0, + int nk,double *kappa,double csw) +{ + int iprms[1],ik,ie; + double dprms[3],*k; + + if (flg_lat!=0) + return lat; + + error(flg_lat!=0,1,"set_lat_parms [lat_parms.c]", + "Attempt to reset the lattice parameters"); + + error(iup[0][0]!=0,1,"set_lat_parms [lat_parms.c]", + "Geometry arrays are already set"); + + if (NPROC>1) + { + iprms[0]=nk; + dprms[0]=beta; + dprms[1]=c0; + dprms[2]=csw; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,3,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error((iprms[0]!=nk)||(dprms[0]!=beta)||(dprms[1]!=c0)||(dprms[2]!=csw),1, + "set_lat_parms [lat_parms.c]","Parameters are not global"); + } + + error_root(nk<0,1,"set_lat_parms [lat_parms.c]", + "Number of kappa values must be non-negative"); + + error_root(c0<=0.0,1,"set_lat_parms [lat_parms.c]", + "Parameter c0 must be positive"); + + if (nk>0) + { + k=malloc(2*nk*sizeof(*k)); + error(k==NULL,1,"set_lat_parms [lat_parms.c]", + "Unable to allocate parameter array"); + } + else + k=NULL; + + lat.kappa=k; + lat.m0=k+nk; + + for (ik=0;ik1)&&(nk>0)) + { + for (ik=0;ik=11) + printf("kappa[%2d] = %.*f\n",ik,IMAX(n,6),lat.kappa[ik]); + else + printf("kappa[%1d] = %.*f\n",ik,IMAX(n,6),lat.kappa[ik]); + } + + n=fdigits(lat.csw); + printf("csw = %.*f\n\n",IMAX(n,1),lat.csw); + } +} + + +void write_lat_parms(FILE *fdat) +{ + int my_rank,endian; + int iw,ik; + stdint_t istd[5]; + double dstd[4]; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + endian=endianness(); + + if (my_rank==0) + { + istd[0]=(stdint_t)(N0); + istd[1]=(stdint_t)(N1); + istd[2]=(stdint_t)(N2); + istd[3]=(stdint_t)(N3); + istd[4]=(stdint_t)(lat.nk); + + dstd[0]=lat.beta; + dstd[1]=lat.c0; + dstd[2]=lat.c1; + dstd[3]=lat.csw; + + if (endian==BIG_ENDIAN) + { + bswap_int(5,istd); + bswap_double(4,dstd); + } + + iw=fwrite(istd,sizeof(stdint_t),5,fdat); + iw+=fwrite(dstd,sizeof(double),4,fdat); + + for (ik=0;ik1) + { + iprms[0]=type; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + + error(iprms[0]!=type,1,"set_bc_parms [lat_parms.c]", + "Parameters are not global"); + + if ((type>=0)&&(type<3)) + { + dprms[0]=cG; + dprms[1]=cF; + + if (type==0) + { + dprms[2]=0.0; + dprms[3]=0.0; + dprms[4]=0.0; + dprms[5]=0.0; + } + else if (type==1) + { + dprms[2]=phi[0]; + dprms[3]=phi[1]; + dprms[4]=phi_prime[0]; + dprms[5]=phi_prime[1]; + } + else if (type==2) + { + dprms[2]=cG_prime; + dprms[3]=cF_prime; + dprms[4]=phi_prime[0]; + dprms[5]=phi_prime[1]; + } + + MPI_Bcast(dprms,6,MPI_DOUBLE,0,MPI_COMM_WORLD); + + ie=((dprms[0]!=cG)||(dprms[1]!=cF)); + + if (type==1) + { + ie|=((dprms[2]!=phi[0])||(dprms[3]!=phi[1])); + ie|=((dprms[4]!=phi_prime[0])||(dprms[5]!=phi_prime[1])); + } + else if (type==2) + { + ie|=((dprms[2]!=cG_prime)||(dprms[3]!=cF_prime)); + ie|=((dprms[4]!=phi_prime[0])||(dprms[5]!=phi_prime[1])); + } + + error(ie!=0,1,"set_bc_parms [lat_parms.c]","Parameters are not global"); + } + } + + error_root((type<0)||(type>3),1,"set_bc_parms [lat_parms.c]", + "Unknown type of boundary condition"); + + bc.type=type; + + if ((type>=0)&&(type<3)) + { + bc.cG[0]=cG; + bc.cF[0]=cF; + + if (type==0) + { + bc.cG[1]=cG; + bc.cF[1]=cF; + } + else if (type==1) + { + bc.cG[1]=cG; + bc.cF[1]=cF; + + bc.phi[0][0]=phi[0]; + bc.phi[0][1]=phi[1]; + bc.phi[0][2]=-phi[0]-phi[1]; + + bc.phi[1][0]=phi_prime[0]; + bc.phi[1][1]=phi_prime[1]; + bc.phi[1][2]=-phi_prime[0]-phi_prime[1]; + } + else if (type==2) + { + bc.cG[1]=cG_prime; + bc.cF[1]=cF_prime; + + bc.phi[1][0]=phi_prime[0]; + bc.phi[1][1]=phi_prime[1]; + bc.phi[1][2]=-phi_prime[0]-phi_prime[1]; + } + } + + flg_bc=1; + + return bc; +} + + +bc_parms_t bc_parms(void) +{ + return bc; +} + + +void print_bc_parms(void) +{ + int my_rank,n[3]; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + if (bc.type==0) + { + printf("Open boundary conditions\n"); + + n[0]=fdigits(bc.cG[0]); + printf("cG = %.*f\n",IMAX(n[0],1),bc.cG[0]); + n[0]=fdigits(bc.cF[0]); + printf("cF = %.*f\n\n",IMAX(n[0],1),bc.cF[0]); + } + else if (bc.type==1) + { + printf("SF boundary conditions\n"); + + n[0]=fdigits(bc.cG[0]); + printf("cG = %.*f\n",IMAX(n[0],1),bc.cG[0]); + n[0]=fdigits(bc.cF[0]); + printf("cF = %.*f\n",IMAX(n[0],1),bc.cF[0]); + + n[0]=fdigits(bc.phi[0][0]); + n[1]=fdigits(bc.phi[0][1]); + n[2]=fdigits(bc.phi[0][2]); + printf("phi = %.*f,%.*f,%.*f\n",IMAX(n[0],1),bc.phi[0][0], + IMAX(n[1],1),bc.phi[0][1],IMAX(n[2],1),bc.phi[0][2]); + + n[0]=fdigits(bc.phi[1][0]); + n[1]=fdigits(bc.phi[1][1]); + n[2]=fdigits(bc.phi[1][2]); + printf("phi' = %.*f,%.*f,%.*f\n\n",IMAX(n[0],1),bc.phi[1][0], + IMAX(n[1],1),bc.phi[1][1],IMAX(n[2],1),bc.phi[1][2]); + } + else if (bc.type==2) + { + printf("Open-SF boundary conditions\n"); + + n[0]=fdigits(bc.cG[0]); + printf("cG = %.*f\n",IMAX(n[0],1),bc.cG[0]); + n[0]=fdigits(bc.cF[0]); + printf("cF = %.*f\n",IMAX(n[0],1),bc.cF[0]); + + n[1]=fdigits(bc.cG[1]); + printf("cG' = %.*f\n",IMAX(n[1],1),bc.cG[1]); + n[1]=fdigits(bc.cF[1]); + printf("cF' = %.*f\n",IMAX(n[1],1),bc.cF[1]); + + n[0]=fdigits(bc.phi[1][0]); + n[1]=fdigits(bc.phi[1][1]); + n[2]=fdigits(bc.phi[1][2]); + printf("phi' = %.*f,%.*f,%.*f\n\n",IMAX(n[0],1),bc.phi[1][0], + IMAX(n[1],1),bc.phi[1][1],IMAX(n[2],1),bc.phi[1][2]); + } + else + printf("Periodic boundary conditions\n\n"); + } +} + + +void write_bc_parms(FILE *fdat) +{ + int my_rank,endian,iw; + stdint_t istd[1]; + double dstd[10]; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + endian=endianness(); + + if (my_rank==0) + { + istd[0]=(stdint_t)(bc.type); + + dstd[0]=bc.cG[0]; + dstd[1]=bc.cG[1]; + dstd[2]=bc.cF[0]; + dstd[3]=bc.cF[1]; + dstd[4]=bc.phi[0][0]; + dstd[5]=bc.phi[0][1]; + dstd[6]=bc.phi[0][2]; + dstd[7]=bc.phi[1][0]; + dstd[8]=bc.phi[1][1]; + dstd[9]=bc.phi[1][2]; + + if (endian==BIG_ENDIAN) + { + bswap_int(1,istd); + bswap_double(10,dstd); + } + + iw=fwrite(istd,sizeof(stdint_t),1,fdat); + iw+=fwrite(dstd,sizeof(double),10,fdat); + + error_root(iw!=11,1,"write_bc_parms [bc_parms.c]", + "Incorrect write count"); + } +} + + +void check_bc_parms(FILE *fdat) +{ + int my_rank,endian,ir,ie; + stdint_t istd[1]; + double dstd[10]; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + endian=endianness(); + + if (my_rank==0) + { + ir=fread(istd,sizeof(stdint_t),1,fdat); + ir+=fread(dstd,sizeof(double),10,fdat); + + if (endian==BIG_ENDIAN) + { + bswap_int(1,istd); + bswap_double(10,dstd); + } + + ie=0; + ie|=(istd[0]!=(stdint_t)(bc.type)); + + ie|=(dstd[0]!=bc.cG[0]); + ie|=(dstd[1]!=bc.cG[1]); + ie|=(dstd[2]!=bc.cF[0]); + ie|=(dstd[3]!=bc.cF[1]); + ie|=(dstd[4]!=bc.phi[0][0]); + ie|=(dstd[5]!=bc.phi[0][1]); + ie|=(dstd[6]!=bc.phi[0][2]); + ie|=(dstd[7]!=bc.phi[1][0]); + ie|=(dstd[8]!=bc.phi[1][1]); + ie|=(dstd[9]!=bc.phi[1][2]); + + error_root(ir!=11,1,"check_bc_parms [bc_parms.c]", + "Incorrect read count"); + + error_root(ie!=0,1,"check_bc_parms [bc_parms.c]", + "Parameters do not match"); + } +} + + +double sea_quark_mass(int im0) +{ + if ((im0>=0)&&(im01) + { + dprms[0]=m0; + + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error(dprms[0]!=m0,1, + "set_sw_parms [lat_parms.c]","Parameter is not global"); + } + + if (m0!=sw.m0) + { + set_flags(ERASED_SW); + set_flags(ERASED_SWD); + set_grid_flags(SAP_BLOCKS,ERASED_SW); + set_flags(ERASED_AWHAT); + } + + sw.m0=m0; + sw.csw=lat.csw; + sw.cF[0]=bc.cF[0]; + sw.cF[1]=bc.cF[1]; + + return sw; +} + + +sw_parms_t sw_parms(void) +{ + sw.csw=lat.csw; + sw.cF[0]=bc.cF[0]; + sw.cF[1]=bc.cF[1]; + + return sw; +} + + +tm_parms_t set_tm_parms(int eoflg) +{ + int iprms[1]; + + if (NPROC>1) + { + iprms[0]=eoflg; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + + error(iprms[0]!=eoflg,1, + "set_tm_parms [lat_parms.c]","Parameter is not global"); + } + + if (eoflg!=tm.eoflg) + set_flags(ERASED_AWHAT); + + tm.eoflg=eoflg; + + return tm; +} + + +tm_parms_t tm_parms(void) +{ + return tm; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/mdint_parms.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/mdint_parms.c new file mode 100644 index 0000000000000000000000000000000000000000..450fae18f1fe2955ed414d34894caec894613a20 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/mdint_parms.c @@ -0,0 +1,479 @@ + +/******************************************************************************* +* +* File mdint_parms.c +* +* Copyright (C) 2011, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Molecular-dynamics integrator data base +* +* The externally accessible functions are +* +* mdint_parms_t set_mdint_parms(int ilv,integrator_t integrator,double lambda, +* int nstep,int nfr,int *ifr) +* Sets the parameters of the molecular-dynamics integrator at level +* ilv and returns a structure containing them (see the notes). +* +* mdint_parms_t mdint_parms(int ilv) +* Returns a structure containing the parameters of the integrator at +* level ilv (see the notes). +* +* void read_mdint_parms(int ilv) +* On process 0, this program scans stdin for a line starting with the +* string "[Level ]" (after any number of blanks), where is +* the integer value passed by the argument. An error occurs if no such +* line or more than one is found. The lines +* +* integrator +* lambda +* nstep +* forces [] +* +* are then read using read_line() [utils/mutils.c]. The line tagged +* "lambda" is required only when the specified integrator is the 2nd +* order OMF integrator. The line tagged "forces" must contain the +* indices of the forces (separated by white space) that are to be +* integrated at this level. On exit, the data are entered in the data +* base by calling set_mdint_parms(ilv,...). +* +* void print_mdint_parms(void) +* Prints the parameters of the defined integrator levels to stdout +* on MPI process 0. +* +* void write_mdint_parms(FILE *fdat) +* Writes the parameters of the defined integrator levels to the file +* fdat on MPI process 0. +* +* void check_mdint_parms(FILE *fdat) +* Compares the parameters of the defined integrator levels with those +* stored on the file fdat on MPI process 0, assuming the latter were +* written to the file by the program write_mdint_parms(). +* +* Notes: +* +* A structure of type mdint_parms_t contains the parameters of a hierarchical +* molecular-dynamics integrator at a specified level (see update/README.mdint). +* Its elements are +* +* integrator Elementary integrator used. This parameter is an enum +* type with one of the following values: +* +* LPFR Leapfrog integrator +* +* OMF2 2nd order Omelyan-Mryglod-Folk integrator +* +* OMF4 4th order Omelyan-Mryglod-Folk integrator +* +* lambda Parameter of the 2nd order OMF integrator +* +* nstep Number of times the elementary integrator is applied +* at this level +* +* nfr Number of forces integrated at this level +* +* ifr Force indices ifr[i] (i=0,..,nfr-1) +* +* The parameter lambda is not used in the case of the leapfrog and the 4th +* order OMF integrator. Up to 32 integrator levels, labeled by an index +* ilv=0,1,..,31, can be specified. +* +* An example of valid section in an input file which can be read by calling +* read_mdint(3) is +* +* [Level 3] +* integrator OMF2 +* lambda 0.2 +* nstep 12 +* forces 2 4 5 +* +* In this case, there are three forces with index 2, 4 and 5. +* +* The programs set_mdint_parms() and read_mdint_parms() perform global +* operations and must be called simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define MDINT_PARMS_C + +#include +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "global.h" + +#define ILVMAX 32 + +static int init=0; +static mdint_parms_t mdp[ILVMAX+1]={{INTEGRATORS,0.0,0,0,NULL}}; + + +static void init_mdp(void) +{ + int i; + + for (i=1;i<=ILVMAX;i++) + mdp[i]=mdp[0]; + + init=1; +} + + +static void alloc_ifr(int ilv,int nfr) +{ + int *ifr; + + if (mdp[ilv].nfr>0) + { + free(mdp[ilv].ifr); + mdp[ilv].nfr=0; + mdp[ilv].ifr=NULL; + } + + if (nfr>0) + { + ifr=malloc(nfr*sizeof(*ifr)); + error(ifr==NULL,1,"alloc_ifr [mdint_parms.c]", + "Unable to allocate index array"); + mdp[ilv].nfr=nfr; + mdp[ilv].ifr=ifr; + } +} + + +mdint_parms_t set_mdint_parms(int ilv,integrator_t integrator,double lambda, + int nstep,int nfr,int *ifr) +{ + int iprms[4],i,j,ie; + double dprms[1]; + + if (init==0) + init_mdp(); + + if (integrator!=OMF2) + lambda=0.0; + + if (NPROC>1) + { + iprms[0]=ilv; + iprms[1]=(int)(integrator); + iprms[2]=nstep; + iprms[3]=nfr; + dprms[0]=lambda; + + MPI_Bcast(iprms,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + ie=0; + ie|=(iprms[0]!=ilv); + ie|=(iprms[1]!=(int)(integrator)); + ie|=(iprms[2]!=nstep); + ie|=(iprms[3]!=nfr); + ie|=(dprms[0]!=lambda); + + for (i=0;i=ILVMAX); + ie|=(integrator==INTEGRATORS); + ie|=(nstep<1); + ie|=(nfr<1); + + for (i=0;i=0)&&(ilv0) + { + printf("Forces ="); + + for (j=0;jnmx) + nmx=nfr; + } + + istd=malloc((nmx+4)*sizeof(stdint_t)); + error_root(istd==NULL,1,"write_mdint_parms [mdint_parms.c]", + "Unable to allocate auxiliary array"); + + for (i=0;inmx) + nmx=nfr; + } + + istd=malloc((nmx+4)*sizeof(stdint_t)); + error_root(istd==NULL,1,"check_mdint_parms [mdint_parms.c]", + "Unable to allocate auxiliary array"); + + for (i=0;i]" (after any number of blanks), where is +* the integer value passed by the argument. An error occurs if no such +* line or more than one is found. The lines +* +* degree +* range +* +* are then read using read_line() [utils/mutils.c] and the data are +* entered into the data base by calling set_rat_parms(). +* +* void print_rat_parms(void) +* Prints the defined rational function parameter sets to stdout on MPI +* process 0. +* +* void write_rat_parms(FILE *fdat) +* Writes the defined rational function parameter sets to the file fdat +* on MPI process 0. +* +* void check_rat_parms(FILE *fdat) +* Compares the defined rational function parameter sets with those +* on the file fdat on MPI process 0, assuming the latter were written +* to the file by the program write_rat_parms(). +* +* Notes: +* +* Currently only Zolotorev rational functions are supported (see the modules +* ratfcts/zolotarev.c and ratfcts/ratfcts.c). The elements of a structure of +* type rat_parms_t are +* +* degree Degree of the rational function +* +* range[2] Lower and upper end of the approximation range (see +* ratfcts/ratfcts.c) +* +* Up to 32 parameter sets, labeled by an index irp=0,1,..,31, can be +* specified. Once a set is defined, it cannot be changed by calling +* set_rat_parms() again. Rational function parameters must be globally +* the same. +* +* Except for rat_parms(), the programs in this module perform global +* operations and must be called simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define RAT_PARMS_C + +#include +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "global.h" + +#define IRPMAX 32 + +static int init=0; +static rat_parms_t rp[IRPMAX+1]={{0,{0.0,0.0}}}; + + +static void init_rp(void) +{ + int irp; + + for (irp=1;irp<=IRPMAX;irp++) + rp[irp]=rp[0]; + + init=1; +} + + +rat_parms_t set_rat_parms(int irp,int degree,double *range) +{ + int ie,iprms[2]; + double dprms[2]; + + if (init==0) + init_rp(); + + if (NPROC>1) + { + iprms[0]=irp; + iprms[1]=degree; + dprms[0]=range[0]; + dprms[1]=range[1]; + + MPI_Bcast(iprms,2,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + ie=0; + ie|=(iprms[0]!=irp); + ie|=(iprms[1]!=degree); + ie|=(dprms[0]!=range[0]); + ie|=(dprms[1]!=range[1]); + + error(ie!=0,1,"set_rat_parms [rat_parms.c]", + "Parameters are not global"); + } + + ie=0; + ie|=((irp<0)||(irp>=IRPMAX)); + ie|=(degree<1); + ie|=(range[0]>=range[1]); + ie|=(range[0]<=0.0); + + error_root(ie!=0,1,"set_rat_parms [rat_parms.c]", + "Parameters are out of range"); + + error_root(rp[irp].degree!=0,1,"set_rat_parms [rat_parms.c]", + "Attempt to reset an already specified parameter set"); + + rp[irp].degree=degree; + rp[irp].range[0]=range[0]; + rp[irp].range[1]=range[1]; + + return rp[irp]; +} + + +rat_parms_t rat_parms(int irp) +{ + if (init==0) + init_rp(); + + if ((irp>=0)&&(irp1) + { + MPI_Bcast(°ree,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(range,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + set_rat_parms(irp,degree,range); +} + + +void print_rat_parms(void) +{ + int my_rank,irp,n[2]; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if ((my_rank==0)&&(init==1)) + { + for (irp=0;irp]" (after any number of blanks), where +* is the integer value passed through the argument. An error occurs +* if no such line or more than one is found. The lines +* +* rwfact +* im0 +* nsrc +* irp +* mu [] +* np [] +* isp [] +* +* are then read using read_line() [utils/mutils.c] and the data are +* added to the data base by calling set_rw_parms(irw,...). Depending +* on the value of "rwfact", some lines are not read and can be omitted +* in the input file. The number of items on the lines with tag "mu", +* "np" and "isp" depends on the reweighting factor too (see the notes). +* +* void print_rw_parms(void) +* Prints the defined reweighting factor parameter sets to stdout on +* MPI process 0. +* +* void write_rw_parms(FILE *fdat) +* Writes the defined reweighting factor parameter sets to the file fdat +* on MPI process 0. +* +* void check_rw_parms(FILE *fdat) +* Compares the defined reweighting factor parameter sets with those +* on the file fdat on MPI process 0, assuming the latter were written +* to the file by the program write_rw_parms(). +* +* Notes: +* +* The elements of a structure of type rw_parms_t are: +* +* rwfact Reweighting factor program used. This parameter is an enum +* type with one of the following values: +* +* RWTM1 (program rwtm1() [update/rwtm.c]), +* +* RWTM1_EO (program rwtm1eo() [update/rwtmeo.c]), +* +* RWTM2 (program rwtm2() [update/rwtm.c]), +* +* RWTM2_EO (program rwtm2eo() [update/rwtmeo.c]), +* +* RWRAT (program rwrat() [update/rwrat.c]). +* +* im0 Index of the bare sea quark mass in the parameter data base +* (see flags/lat_parms.c). +* +* nsrc Number N of random source fields to be used for the stochastic +* estimation of the reweighting factor. If the latter is split +* into a product factors, N random fields are used for each of +* them. +* +* irp Rational function parameter set index. Only relevant if +* rwfact=RWRAT. +* +* nfct If rwfact=RWTM*: Number of Hasenbusch factors into which the +* reweighting factor is decomposed; +* If rwfact=RWRAT: Number of rational factors into which the +* rational function is decomposed. +* +* mu Array of twisted masses that define the Hasenbusch factors +* (nfct elements; 0 +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "global.h" + +#define IRWMAX 32 + +static int init=0; +static rwfact_t rwfact[]={RWTM1,RWTM1_EO,RWTM2,RWTM2_EO,RWRAT}; +static rw_parms_t rw[IRWMAX+1]={{RWFACTS,0,0,0,0,NULL,NULL,NULL}}; + + +static void init_rw(void) +{ + int irw; + + for (irw=1;irw<=IRWMAX;irw++) + rw[irw]=rw[0]; + + init=1; +} + + +rw_parms_t set_rw_parms(int irw,rwfact_t rwfact,int im0,int nsrc, + int irp,int nfct,double *mu,int *np,int *isp) +{ + int iprms[6],i,ie; + double dprms[1]; + + if (init==0) + init_rw(); + + error_root((rwfact!=RWTM1)&&(rwfact!=RWTM1_EO)&& + (rwfact!=RWTM2)&&(rwfact!=RWTM2_EO)&&(rwfact!=RWRAT),1, + "set_rw_parms [rw_parms.c]","Unknown type of reweighting factor"); + + if (rwfact!=RWRAT) + irp=0; + + if (NPROC>1) + { + iprms[0]=irw; + iprms[1]=(int)(rwfact); + iprms[2]=im0; + iprms[3]=nsrc; + iprms[4]=irp; + iprms[5]=nfct; + + MPI_Bcast(iprms,6,MPI_INT,0,MPI_COMM_WORLD); + + ie=0; + ie|=(iprms[0]!=irw); + ie|=(iprms[1]!=(int)(rwfact)); + ie|=(iprms[2]!=im0); + ie|=(iprms[3]!=nsrc); + ie|=(iprms[4]!=irp); + ie|=(iprms[5]!=nfct); + + error(ie!=0,1,"set_rw_parms [rw_parms.c]", + "Parameters are not global"); + } + + ie=0; + ie|=((irw<0)||(irw>=IRWMAX)); + ie|=(im0<0); + ie|=(nsrc<1); + ie|=(irp<0); + ie|=(nfct<1); + + error_root(ie!=0,1,"set_rw_parms [rw_parms.c]", + "Parameters are out of range"); + + if (NPROC>1) + { + if (rwfact!=RWRAT) + { + for (i=0;i=0)&&(irw1) + { + MPI_Bcast(&idr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&im0,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nsrc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&irp,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nfct,1,MPI_INT,0,MPI_COMM_WORLD); + } + + if (idr<4) + { + mu=malloc(nfct*sizeof(*mu)); + np=NULL; + isp=malloc(nfct*sizeof(*isp)); + error((mu==NULL)||(isp==NULL),1,"read_rw_parms [rw_parms.c]", + "Unable to allocated data arrays"); + } + else + { + mu=NULL; + np=malloc(2*nfct*sizeof(*np)); + isp=np+nfct; + error(np==NULL,1,"read_rw_parms [rw_parms.c]", + "Unable to allocated data arrays"); + } + + if (my_rank==0) + { + if (idr<4) + read_dprms("mu",nfct,mu); + else + read_iprms("np",nfct,np); + + n=count_tokens("isp"); + error_root(n<1,1,"read_rw_parms [rw_parms.c]", + "No data on the line with tag isp"); + + if (n>nfct) + n=nfct; + read_iprms("isp",n,isp); + + for (i=n;i1) + { + if (idr<4) + MPI_Bcast(mu,nfct,MPI_DOUBLE,0,MPI_COMM_WORLD); + else + MPI_Bcast(np,nfct,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(isp,nfct,MPI_INT,0,MPI_COMM_WORLD); + } + + set_rw_parms(irw,rwfact[idr],im0,nsrc,irp,nfct,mu,np,isp); + + if (idr<4) + { + free(mu); + free(isp); + } + else + free(np); +} + + +void print_rw_parms(void) +{ + int my_rank,irw,idr,nfct,n,i; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if ((my_rank==0)&&(init==1)) + { + for (irw=0;irw +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "global.h" + +static sap_parms_t sap={{0,0,0,0},0,0,0}; + + +static void check_block_size(int *bs) +{ + int n0,n1,n2,n3; + + error_root((bs[0]<4)||(bs[1]<4)||(bs[2]<4)||(bs[3]<4)|| + (bs[0]>L0)||(bs[1]>L1)||(bs[2]>L2)||(bs[3]>L3),1, + "check_block_size [sap_parms.c]", + "Block sizes are out of range"); + + error_root((bs[0]%2)||(bs[1]%2)||(bs[2]%2)||(bs[3]%2),1, + "check_block_size [sap_parms.c]", + "Block sizes must be even"); + + error_root((L0%bs[0])||(L1%bs[1])||(L2%bs[2])||(L3%bs[3]),1, + "check_block_size [sap_parms.c]", + "Blocks do not divide the local lattice"); + + n0=L0/bs[0]; + n1=L1/bs[1]; + n2=L2/bs[2]; + n3=L3/bs[3]; + + error_root(((NPROC0*n0)%2)||((NPROC1*n1)%2)|| + ((NPROC2*n2)%2)||((NPROC3*n3)%2),1, + "check_block_size [sap_parms.c]", + "There must be an even number of blocks in each direction"); + + error_root((n0*n1*n2*n3)%2,1, + "check_block_size [sap_parms.c]", + "The number of blocks in the local lattice must be even"); +} + + + +sap_parms_t set_sap_parms(int *bs,int isolv,int nmr,int ncy) +{ + int iprms[7]; + + if (NPROC>1) + { + iprms[0]=bs[0]; + iprms[1]=bs[1]; + iprms[2]=bs[2]; + iprms[3]=bs[3]; + iprms[4]=isolv; + iprms[5]=nmr; + iprms[6]=ncy; + + MPI_Bcast(iprms,7,MPI_INT,0,MPI_COMM_WORLD); + + error((iprms[0]!=bs[0])||(iprms[1]!=bs[1])||(iprms[2]!=bs[2])|| + (iprms[3]!=bs[3])||(iprms[4]!=isolv)||(iprms[5]!=nmr)|| + (iprms[6]!=ncy),1, + "set_sap_parms [sap_parms.c]","Parameters are not global"); + } + + if (sap.ncy>0) + { + error_root((bs[0]!=sap.bs[0])||(bs[1]!=sap.bs[1])|| + (bs[2]!=sap.bs[2])||(bs[3]!=sap.bs[3]),1, + "set_sap_parms [sap_parms.c]","bs[4] may be set only once"); + } + else + { + check_block_size(bs); + sap.bs[0]=bs[0]; + sap.bs[1]=bs[1]; + sap.bs[2]=bs[2]; + sap.bs[3]=bs[3]; + } + + error_root((isolv<0)||(isolv>1)||(nmr<1)||(ncy<1),1, + "set_sap_parms [sap_parms.c]", + "Improper value of isolv, nmr or ncy"); + + sap.isolv=isolv; + sap.nmr=nmr; + sap.ncy=ncy; + + return sap; +} + + +sap_parms_t sap_parms(void) +{ + return sap; +} + + +void print_sap_parms(int ipr) +{ + int my_rank; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + if (ipr) + { + printf("SAP parameters:\n"); + printf("bs = %d %d %d %d\n", + sap.bs[0],sap.bs[1],sap.bs[2],sap.bs[3]); + printf("isolv = %d\n",sap.isolv); + printf("nmr = %d\n",sap.nmr); + printf("ncy = %d\n\n",sap.ncy); + } + else + { + printf("SAP block size:\n"); + printf("bs = %d %d %d %d\n\n", + sap.bs[0],sap.bs[1],sap.bs[2],sap.bs[3]); + } + } +} + + +void write_sap_parms(FILE *fdat) +{ + int my_rank,endian; + int i,iw; + stdint_t istd[7]; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + endian=endianness(); + + if (my_rank==0) + { + for (i=0;i<4;i++) + istd[i]=(stdint_t)(sap.bs[i]); + + istd[4]=(stdint_t)(sap.isolv); + istd[5]=(stdint_t)(sap.nmr); + istd[6]=(stdint_t)(sap.ncy); + + if (endian==BIG_ENDIAN) + bswap_int(7,istd); + + iw=fwrite(istd,sizeof(stdint_t),7,fdat); + error_root(iw!=7,1,"write_sap_parms [sap_parms.c]", + "Incorrect write count"); + } +} + + +void check_sap_parms(FILE *fdat) +{ + int my_rank,endian; + int i,ir,ie; + stdint_t istd[7]; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + endian=endianness(); + + if (my_rank==0) + { + ir=fread(istd,sizeof(stdint_t),7,fdat); + error_root(ir!=7,1,"check_sap_parms [sap_parms.c]", + "Incorrect read count"); + + if (endian==BIG_ENDIAN) + bswap_int(7,istd); + + ie=0; + + for (i=0;i<4;i++) + ie|=(istd[i]!=(stdint_t)(sap.bs[i])); + + ie|=(istd[4]!=(stdint_t)(sap.isolv)); + ie|=(istd[5]!=(stdint_t)(sap.nmr)); + ie|=(istd[6]!=(stdint_t)(sap.ncy)); + + error_root(ie!=0,1,"check_sap_parms [sap_parms.c]", + "Parameters do not match"); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/solver_parms.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/solver_parms.c new file mode 100644 index 0000000000000000000000000000000000000000..fa83f91d4d3973dc25e75db311cf7fa5686bb550 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/solver_parms.c @@ -0,0 +1,433 @@ + +/******************************************************************************* +* +* File solver_parms.c +* +* Copyright (C) 2011, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Solver parameter data base +* +* The externally accessible functions are +* +* solver_parms_t set_solver_parms(int isp,solver_t solver, +* int nkv,int isolv,int nmr,int ncy, +* int nmx,double res) +* Sets the parameters in the solver parameter set number isp and returns +* a structure containing them (see the notes). +* +* solver_parms_t solver_parms(int isp) +* Returns a structure containing the solver parameter set number +* isp (see the notes). +* +* void read_solver_parms(int isp) +* On process 0, this program scans stdin for a line starting with the +* string "[Solver ]" (after any number of blanks), where is +* the integer value passed by the argument. An error occurs if no such +* line or more than one is found. The lines +* +* solver +* nkv +* isolv +* nmr +* ncy +* nmx +* res +* +* are then read one by one using read_line() [utils/mutils.c]. The +* lines with tags nkv,..,ncy may be absent in the case of the CGNE +* and MSCG solvers (see the notes). The data are then added to the +* data base by calling set_solver_parms(isp,...). +* +* void print_solver_parms(int *isap,int *idfl) +* Prints the parameters of the defined solvers to stdout on MPI +* process 0. On exit the flag isap is 1 or 0 depending on whether +* one of the solvers makes use of the Schwarz Alternating Procedure +* (SAP) or not. Similarly, the flag idfl is set 1 or 0 depending on +* whether deflation is used or not. On MPI processes other than 0, +* the program does nothing and sets isap and idfl to zero. +* +* void write_solver_parms(FILE *fdat) +* Writes the parameters of the defined solvers to the file fdat on +* MPI process 0. +* +* void check_solver_parms(FILE *fdat) +* Compares the parameters of the defined solvers with those stored +* on the file fdat on MPI process 0, assuming the latter were written +* to the file by the program write_solver_parms() (mismatches of the +* maximal solver iteration number are not considered to be an error). +* +* Notes: +* +* The elements of a structure of type solver_parms_t are +* +* solver Solver program used. This parameter is an enum type with +* one of the following values: +* +* CGNE Program tmcg() [forces/tmcg.c]. +* +* MSCG Program tmcgm() [forces/tmcgm.c]. +* +* SAP_GCR Program sap_gcr() [sap/sap_gcr.c]. +* +* DFL_SAP_GCR Program dfl_sap_gcr() [dfl/dfl_sap_gcr.c]. +* +* nkv Maximal number of Krylov vectors generated before the GCR +* algorithm is restarted if solver=*_GCR. +* +* isolv Block solver to be used if solver=*SAP_GCR (0: plain MinRes, +* 1: eo-preconditioned MinRes). +* +* nmr Number of block solver iterations if solver=*SAP_GCR. +* +* ncy Number of SAP cycles to be applied if solver=*SAP_GCR. +* +* nmx Maximal number of CG iterations if solver={CGNE,MSCG} or +* maximal total number of Krylov vectors that may be generated +* if solver={SAP_GCR,DFL_SAP_GCR}. +* +* res Desired maximal relative residue of the calculated solution. +* +* Depending on the solver, some parameters are not used. These are set to +* zero by the program set_solver_parms() independently of the values of +* the arguments. +* +* Up to 32 solver parameter sets, labeled by an index isp=0,1,..,31, can +* be specified. Once a set is specified, it cannot be changed by calling +* set_solver_parms() again. Solver parameters must be globally the same. +* +* Except for solver_parms(), the programs in this module perform global +* operations and must be called simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define SOLVER_PARMS_C + +#include +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "global.h" + +#define ISPMAX 32 + +static int init=0; +static solver_t solver[]={CGNE,MSCG,SAP_GCR,DFL_SAP_GCR}; +static solver_parms_t sp[ISPMAX+1]={{SOLVERS,0,0,0,0,0,0.0}}; + + +static void init_sp(void) +{ + int i; + + for (i=1;i<=ISPMAX;i++) + sp[i]=sp[0]; + + init=1; +} + + +solver_parms_t set_solver_parms(int isp,solver_t solver, + int nkv,int isolv,int nmr,int ncy, + int nmx,double res) +{ + int ie,iprms[7]; + double dprms[1]; + + if (init==0) + init_sp(); + + if ((solver==CGNE)||(solver==MSCG)) + { + nkv=0; + isolv=0; + nmr=0; + ncy=0; + } + + if (NPROC>1) + { + iprms[0]=isp; + iprms[1]=(int)(solver); + iprms[2]=nkv; + iprms[3]=isolv; + iprms[4]=nmr; + iprms[5]=ncy; + iprms[6]=nmx; + dprms[0]=res; + + MPI_Bcast(iprms,7,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + ie=0; + ie|=(iprms[0]!=isp); + ie|=(iprms[1]!=(int)(solver)); + ie|=(iprms[2]!=nkv); + ie|=(iprms[3]!=isolv); + ie|=(iprms[4]!=nmr); + ie|=(iprms[5]!=ncy); + ie|=(iprms[6]!=nmx); + ie|=(dprms[0]!=res); + + error(ie!=0,1,"set_solver_parms [solver_parms.c]", + "Parameters are not global"); + } + + ie=0; + ie|=(isp<0)||(isp>=ISPMAX); + ie|=(solver==SOLVERS); + ie|=(nmx<1); + + if ((solver==SAP_GCR)||(solver==DFL_SAP_GCR)) + { + ie|=(isolv<0)||(isolv>1); + ie|=(nmr<1); + ie|=(ncy<1); + } + + error_root(ie!=0,1,"set_solver_parms [solver_parms.c]", + "Parameters are out of range"); + + error_root(sp[isp].solver!=SOLVERS,1,"set_solver_parms [solver_parms.c]", + "Attempt to reset an already specified solver parameter set"); + + sp[isp].solver=solver; + sp[isp].nkv=nkv; + sp[isp].isolv=isolv; + sp[isp].nmr=nmr; + sp[isp].ncy=ncy; + sp[isp].nmx=nmx; + sp[isp].res=res; + + return sp[isp]; +} + + +solver_parms_t solver_parms(int isp) +{ + if (init==0) + init_sp(); + + if ((isp>=0)&&(isp1) + { + MPI_Bcast(&ids,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nkv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&isolv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + set_solver_parms(isp,solver[ids],nkv,isolv,nmr,ncy,nmx,res); +} + + +void print_solver_parms(int *isap,int *idfl) +{ + int my_rank,i; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + (*isap)=0; + (*idfl)=0; + + if ((my_rank==0)&&(init==1)) + { + for (i=0;iexp(t*T^a)*U(x,mu)}. + +The generators are assumed to be normalized such that + + tr{T^a*T^b}=-(1/2)*delta^{ab}, + +but the 3x3 matrices F(x,mu)^a*T^a (summed over a) do not depend on this +convention. + + +Supported actions +----------------- + +- Gauge action. + Program: action0(). + Symbol: ACG. + Parameters: none. + +- One-parameter twisted-mass pseudo-fermion action. + Program: action1(). + Symbol: ACF_TM1. + Parameters: mu,ipf,isp [see force1.c], m0 [bare mass]. + +- One-parameter twisted-mass pseudo-fermion action with even-odd + preconditioning. + Program: action4(). + Symbol: ACF_TM1_EO. + Parameters: mu,ipf,isp [see force4.c], m0 [bare mass]. + +- One-parameter twisted-mass pseudo-fermion action with even-odd + preconditioning plus "small determinant" action. + Program: action4(). + Symbol: ACF_TM1_EO_SDET. + Parameters: mu,ipf,isp [see force4.c], m0 [bare mass]. + +- Two-parameter (Hasenbusch) twisted-mass pseudo-fermion action. + Program: action2(). + Symbol: ACF_TM2. + Parameters: mu0,mu1,ipf,isp [see force2.c], m0 [bare mass]. + +- Two-parameter (Hasenbusch) twisted-mass pseudo-fermion action + with even-odd preconditioning. + Program: action5(). + Symbol: ACF_TM2_EO. + Parameters: mu0,mu1,ipf,isp [see force5.c], m0 [bare mass]. + +- Rational function pseudo-fermion action. + Program: action3(). + Symbol: ACF_RAT. + Parameters: irat,ipf,isp [see force3.c], m0 [bare mass]. + +- Rational function pseudo-fermion action plus "small determinant" + action. + Program: action3(). + Symbol: ACF_RAT_SDET. + Parameters: irat,ipf,isp [see force3.c], m0 [bare mass]. + + +Associated forces +----------------- + +- Gauge force. + Program: force0(). + Symbol: FRG. + Parameters: none. + +- One-parameter twisted-mass pseudo-fermion force. + Program: force1(). + Symbol: FRF_TM1. + Parameters: mu,ipf,isp,icr [see force1.c], m0 [bare mass]. + +- One-parameter twisted-mass pseudo-fermion force with even-odd + preconditioning. + Program: force4(). + Symbol: FRF_TM1_EO. + Parameters: mu,ipf,isp,icr [see force4.c], m0 [bare mass]. + +- One-parameter twisted-mass pseudo-fermion force with even-odd + preconditioning plus "small determinant" force. + Program: force4(). + Symbol: FRF_TM1_EO_SDET. + Parameters: mu,ipf,isp,icr [see force4.c], m0 [bare mass]. + +- Two-parameter (Hasenbusch) twisted-mass pseudo-fermion force. + Program: force2(). + Symbol: FRF_TM2. + Parameters: mu0,mu1,ipf,isp,icr [see force2.c], m0 [bare mass]. + +- Two-parameter (Hasenbusch) twisted-mass pseudo-fermion force with + even-odd preconditioning. + Program: force5(). + Symbol: FRF_TM2_EO. + Parameters: mu0,mu1,ipf,isp,icr [see force5.c], m0 [bare mass]. + +- Rational function pseudo-fermion force. + Program: force3(). + Symbol: FRF_RAT. + Parameters: irat,ipf,isp [see force3.c], m0 [bare mass]. + +- Rational function pseudo-fermion plus "small determinant" force. + Program: force3(). + Symbol: FRF_RAT_SDET. + Parameters: irat,ipf,isp [see force3.c], m0 [bare mass]. + + +Pseudo-fermion fields +--------------------- + +Pseudo-fermion fields are allocated permanently at the start of the simulation +program. They are administered by the module mdflds/mdflds.c together with the +momentum and the force fields. + +The maximal number npf of pseudo-fermion fields is set together with the other +parameters of the HMC algorithm (see flags/hmc_parms.c). + + +Solver programs +--------------- + +The available solver programs for the Dirac equation are + +- Conjugate gradient algorithm for the normal Dirac equation. + Programs: tmcg() and tmcgeo() [see tmcg.c]. + Symbol: CGNE. + +- Multi-shift conjugate gradient algorithm for the normal even-odd + preconditioned Dirac equation. + Program: tmcgm() [see tmcgm.c]. + Symbol: MSCG. + +- SAP-preconditioned GCR algorithm for the Dirac equation. + Program: sap_gcr() [see sap_gcr.c]. + Symbol: SAP_GCR. + +- Deflated SAP-preconditioned GCR algorithm for the Dirac equation. + Program: dfl_sap_gcr() and dfl_sap_gcr2() [see dfl_sap_gcr.c]. + Symbol: DFL_SAP_GCR. + +A particular solver is thus described by the solver symbol, the values of the +program arguments and further parameters (the bare quark mass, the parameters +of the SAP preconditioner and those related to the deflation subspace). + + +Chronological solver +-------------------- + +The force programs force1() and force2() can be instructed to propagate the +solutions of the Dirac equation along the molecular-dynamics trajectories. The +stacks of previous solutions are handled by the module chrono.c. + + +Action, force and solver data base +---------------------------------- + +The parameters of the actions, forces and solvers used in a simulation are +stored in a data base. At the beginning of the simulation program, the list of +all actions, forces and solvers must be defined. These data are then entered +in the data base using the utility programs in the flags module directory (see +action_parms.c, force_parms.c and solver_parms.c). + + +Rational function data base +--------------------------- + +For the charm and the strange quark, a version of the RHMC algorithm is used. +The basic rational functions are [n,n] Zolotarev rational functions, but in +the simulation programs it is advantageous to split these into a few rational +functions of lower degree and to use a pseudo-fermion action for each of them. + +The data base for rational functions consists of two parts, one for the +parameters of the basic Zolotarev rational functions (flags/rat_parms.c) and +the other for the rational functions that occur in the pseudo-fermion actions +(see ratfcts/ratfcts.c). diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force0.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force0.c new file mode 100644 index 0000000000000000000000000000000000000000..6e84e5d199c36447e93e9d82bc480c9f29bd3535 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force0.c @@ -0,0 +1,712 @@ + +/******************************************************************************* +* +* File force0.c +* +* Copyright (C) 2005, 2009-2014 Martin Luescher, John Bulava +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Action of the double-precision gauge field and associated force. +* +* The externally accessible functions are +* +* void plaq_frc(void) +* Computes the force deriving from the Wilson plaquette action, +* omitting the prefactor 1/g0^2, and assigns the result to the MD +* force field. In the case of open, SF or open-SF boundary conditions, +* the boundary improvement coefficients are set to their tree-level +* value independently of the values stored in the parameter data base. +* +* void force0(double c) +* Computes the force deriving from the gauge action, including the +* prefactor 1/g0^2, multiplies the calculated force by c and assigns +* the result to the MD force field. The coupling g0 and the other +* parameters of the gauge action are retrieved from the parameter +* data base. +* +* double action0(int icom) +* Computes the local part of the gauge action including the prefactor +* 1/g0^2. The coupling g0 and the other parameters of the action are +* retrieved from the parameter data base. The program returns the sum +* of the local parts of the action over all MPI processes if icom=1 +* and otherwise just the local part. +* +* Notes: +* +* See the notes doc/gauge_action.pdf for the definition of the gauge action +* and a description of the computation of the force deriving from it. The +* molecular-dynamics (MD) force field is the one returned by the program +* mdflds() (see mdflds/mdflds.c). +* +* On the links in the local lattice where the static link variables reside, +* the programs plaq_frc() and force0() set the force field to zero. +* +* The programs in this module perform global communications and must be +* called simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define FORCE0_C + +#include +#include +#include +#include "mpi.h" +#include "flags.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define MAX_LEVELS 8 +#define BLK_LENGTH 8 + +static const int plns[6][2]={{0,1},{0,2},{0,3},{2,3},{3,1},{1,2}}; +static int nfc[8],ofs[8],hofs[8],cnt[MAX_LEVELS],init=0; +static double smx[MAX_LEVELS]; +static su3_dble *udb,*hdb; +static su3_dble wd[3],vd[4] ALIGNED16; +static su3_alg_dble X ALIGNED16; + + +static void set_ofs(void) +{ + nfc[0]=FACE0/2; + nfc[1]=FACE0/2; + nfc[2]=FACE1/2; + nfc[3]=FACE1/2; + nfc[4]=FACE2/2; + nfc[5]=FACE2/2; + nfc[6]=FACE3/2; + nfc[7]=FACE3/2; + + ofs[0]=VOLUME; + ofs[1]=ofs[0]+(FACE0/2); + ofs[2]=ofs[1]+(FACE0/2); + ofs[3]=ofs[2]+(FACE1/2); + ofs[4]=ofs[3]+(FACE1/2); + ofs[5]=ofs[4]+(FACE2/2); + ofs[6]=ofs[5]+(FACE2/2); + ofs[7]=ofs[6]+(FACE3/2); + + hofs[0]=0; + hofs[1]=hofs[0]+3*FACE0; + hofs[2]=hofs[1]+3*FACE0; + hofs[3]=hofs[2]+3*FACE1; + hofs[4]=hofs[3]+3*FACE1; + hofs[5]=hofs[4]+3*FACE2; + hofs[6]=hofs[5]+3*FACE2; + hofs[7]=hofs[6]+3*FACE3; + + init=1; +} + + +static void set_staples(int n,int ix,int ia) +{ + int mu,nu,ifc; + int iy,ib,ip[4]; + + mu=plns[n][0]; + nu=plns[n][1]; + + if (!ia) + { + iy=idn[ix][nu]; + + if (iynu)]; + } + } + + iy=iup[ix][mu]; + + if (iymu)]; + } + + if (!ia) + { + iy=idn[ix][mu]; + + if (iymu)]; + } + } + + iy=iup[ix][nu]; + + if (iynu)]; + } +} + + +void plaq_frc(void) +{ + int bc,n,ix,t,ip[4]; + double r; + su3_alg_dble *fdb; + mdflds_t *mdfs; + + if (query_flags(UDBUF_UP2DATE)!=1) + copy_bnd_ud(); + + bc=bc_type(); + udb=udfld(); + mdfs=mdflds(); + fdb=(*mdfs).frc; + set_frc2zero(); + + for (ix=0;ix0)||(bc!=1)) + { + _su3_alg_sub_assign(*(fdb+ip[2]),X); + } + } + } + + if ((t>0)||(bc!=1)) + { + r=1.0; + + if (((t==0)&&(bc!=3))||((t==(N0-1))&&(bc==0))) + r=0.5; + + for (n=3;n<6;n++) + { + plaq_uidx(n,ix,ip); + + su3xsu3dag(udb+ip[1],udb+ip[3],wd); + su3dagxsu3(udb+ip[2],udb+ip[0],wd+1); + prod2su3alg(wd,wd+1,&X); + _su3_alg_mul_add_assign(*(fdb+ip[1]),r,X); + prod2su3alg(wd+1,wd,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[3]),r,X); + + su3xsu3dag(wd,udb+ip[2],wd+1); + prod2su3alg(udb+ip[0],wd+1,&X); + _su3_alg_mul_add_assign(*(fdb+ip[0]),r,X); + _su3_alg_mul_sub_assign(*(fdb+ip[2]),r,X); + } + } + } + + add_bnd_frc(); +} + + +void force0(double c) +{ + int bc,n,ix,t,ip[4]; + double c0,c1,*cG; + double r0,r1; + su3_alg_dble *fdb; + mdflds_t *mdfs; + lat_parms_t lat; + bc_parms_t bcp; + + lat=lat_parms(); + c*=(lat.beta/6.0); + c0=lat.c0; + c1=lat.c1; + + bcp=bc_parms(); + bc=bcp.type; + cG=bcp.cG; + + if (query_flags(UDBUF_UP2DATE)!=1) + copy_bnd_ud(); + + udb=udfld(); + mdfs=mdflds(); + fdb=(*mdfs).frc; + set_frc2zero(); + + if (c0==1.0) + hdb=NULL; + else + { + if (init==0) + set_ofs(); + + if (query_flags(BSTAP_UP2DATE)!=1) + set_bstap(); + hdb=bstap(); + } + + for (ix=0;ix0)||(bc!=1)) + { + _su3_alg_mul_sub_assign(*(fdb+ip[2]),r0,X); + } + + if (c0!=1.0) + { + set_staples(n,ix,0); + + if ((t==0)&&(bc==1)) + { + su3xsu3(wd+1,udb+ip[0],wd+2); + su3xsu3(udb+ip[0],wd+2,wd+2); + + prod2su3alg(wd+1,wd+2,&X); + _su3_alg_mul_add_assign(*(fdb+ip[1]),r1,X); + + prod2su3alg(wd+2,wd+1,&X); + _su3_alg_mul_add_assign(*(fdb+ip[0]),r1,X); + + su3dagxsu3(udb+ip[2],wd+2,wd+2); + + prod2su3alg(wd+2,wd,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[3]),r1,X); + } + + if ((t==(N0-1))&&(bc!=3)) + { + su3xsu3(wd+1,udb+ip[0],wd+2); + su3xsu3(udb+ip[0],wd+2,wd+2); + + prod2su3alg(wd+2,wd+1,&X); + _su3_alg_mul_add_assign(*(fdb+ip[0]),r1,X); + _su3_alg_mul_sub_assign(*(fdb+ip[2]),r1,X); + + su3dagxsu3(udb+ip[2],wd+2,wd+2); + + prod2su3alg(wd+2,wd,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[3]),r1,X); + } + + if ((t<(N0-1))||(bc==3)) + { + prod2su3alg(wd+1,vd,&X); + _su3_alg_mul_add_assign(*(fdb+ip[1]),r1,X); + } + + if ((t>0)||(bc!=1)) + { + prod2su3alg(vd,wd+1,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[2]),r1,X); + } + + su3dagxsu3(udb+ip[2],vd,wd+1); + prod2su3alg(wd+1,wd,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[3]),r1,X); + + if ((t<(N0-2))||((t==(N0-2))&&(bc!=0))||(bc==3)) + { + su3xsu3dag(udb+ip[3],vd+1,wd+1); + su3xsu3dag(wd+1,udb+ip[0],wd+2); + prod2su3alg(udb+ip[2],wd+2,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[0]),r1,X); + + if ((t>0)||(bc!=1)) + { + _su3_alg_mul_add_assign(*(fdb+ip[2]),r1,X); + } + + prod2su3alg(wd+2,udb+ip[2],&X); + _su3_alg_mul_add_assign(*(fdb+ip[3]),r1,X); + } + + if ((t>0)||(bc==3)) + { + su3xsu3dag(wd,vd+2,wd+1); + prod2su3alg(udb+ip[0],wd+1,&X); + _su3_alg_mul_add_assign(*(fdb+ip[0]),r1,X); + + if ((t<(N0-1))||(bc==3)) + { + prod2su3alg(wd+1,udb+ip[0],&X); + _su3_alg_mul_add_assign(*(fdb+ip[1]),r1,X); + } + + su3dagxsu3(vd+2,udb+ip[0],wd+1); + prod2su3alg(wd+1,wd,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[3]),r1,X); + } + + su3xsu3dag(udb+ip[1],vd+3,wd); + su3xsu3dag(wd,udb+ip[2],wd+1); + prod2su3alg(udb+ip[0],wd+1,&X); + _su3_alg_mul_add_assign(*(fdb+ip[0]),r1,X); + + if ((t>0)||(bc!=1)) + { + _su3_alg_mul_sub_assign(*(fdb+ip[2]),r1,X); + } + + if ((t<(N0-1))||(bc==3)) + { + prod2su3alg(wd+1,udb+ip[0],&X); + _su3_alg_mul_add_assign(*(fdb+ip[1]),r1,X); + } + } + } + } + + if ((t>0)||(bc!=1)) + { + r0=c*c0; + r1=c*c1; + + if ((t==0)&&(bc!=3)) + { + r0*=(0.5*cG[0]); + r1*=(0.5*cG[0]); + } + else if ((t==(N0-1))&&(bc==0)) + { + r0*=(0.5*cG[1]); + r1*=(0.5*cG[1]); + } + + for (n=3;n<6;n++) + { + plaq_uidx(n,ix,ip); + + su3xsu3dag(udb+ip[1],udb+ip[3],wd); + su3dagxsu3(udb+ip[2],udb+ip[0],wd+1); + prod2su3alg(wd,wd+1,&X); + _su3_alg_mul_add_assign(*(fdb+ip[1]),r0,X); + + prod2su3alg(wd+1,wd,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[3]),r0,X); + + su3xsu3dag(wd,udb+ip[2],wd+1); + prod2su3alg(udb+ip[0],wd+1,&X); + _su3_alg_mul_add_assign(*(fdb+ip[0]),r0,X); + _su3_alg_mul_sub_assign(*(fdb+ip[2]),r0,X); + + if (c0!=1.0) + { + set_staples(n,ix,0); + + prod2su3alg(wd+1,vd,&X); + _su3_alg_mul_add_assign(*(fdb+ip[1]),r1,X); + + prod2su3alg(vd,wd+1,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[2]),r1,X); + + su3dagxsu3(udb+ip[2],vd,wd+1); + prod2su3alg(wd+1,wd,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[3]),r1,X); + + su3xsu3dag(udb+ip[3],vd+1,wd+1); + su3xsu3dag(wd+1,udb+ip[0],wd+2); + prod2su3alg(udb+ip[2],wd+2,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[0]),r1,X); + _su3_alg_mul_add_assign(*(fdb+ip[2]),r1,X); + + prod2su3alg(wd+2,udb+ip[2],&X); + _su3_alg_mul_add_assign(*(fdb+ip[3]),r1,X); + + su3xsu3dag(wd,vd+2,wd+1); + prod2su3alg(udb+ip[0],wd+1,&X); + _su3_alg_mul_add_assign(*(fdb+ip[0]),r1,X); + + prod2su3alg(wd+1,udb+ip[0],&X); + _su3_alg_mul_add_assign(*(fdb+ip[1]),r1,X); + + su3dagxsu3(vd+2,udb+ip[0],wd+1); + prod2su3alg(wd+1,wd,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[3]),r1,X); + + su3xsu3dag(udb+ip[1],vd+3,wd); + su3xsu3dag(wd,udb+ip[2],wd+1); + prod2su3alg(udb+ip[0],wd+1,&X); + _su3_alg_mul_add_assign(*(fdb+ip[0]),r1,X); + _su3_alg_mul_sub_assign(*(fdb+ip[2]),r1,X); + + prod2su3alg(wd+1,udb+ip[0],&X); + _su3_alg_mul_add_assign(*(fdb+ip[1]),r1,X); + } + } + } + } + + add_bnd_frc(); +} + + +static void wloops(int n,int ix,int t,double c0,double *trU) +{ + int bc,ip[4]; + + bc=bc_type(); + plaq_uidx(n,ix,ip); + + trU[0]=0.0; + trU[1]=0.0; + trU[2]=0.0; + trU[3]=0.0; + + if ((n>=3)||(t<(N0-1))||(bc!=0)) + { + su3dagxsu3(udb+ip[2],udb+ip[0],wd); + su3xsu3dag(udb+ip[1],udb+ip[3],wd+1); + cm3x3_retr(wd,wd+1,trU); + trU[0]=3.0-trU[0]; + } + + if (c0!=1.0) + { + set_staples(n,ix,1); + + if ((n<3)&&(((t==0)&&(bc==1))|| + ((t==(N0-1))&&((bc==1)||(bc==2))))) + { + su3xsu3(wd,wd+1,wd+1); + cm3x3_retr(wd+1,wd+1,trU+3); + trU[3]=3.0-trU[3]; + } + + if ((n>=3)||(t<(N0-1))||(bc!=0)) + { + su3xsu3dag(udb+ip[1],vd+3,wd+1); + cm3x3_retr(wd,wd+1,trU+1); + trU[1]=3.0-trU[1]; + } + + if ((n>=3)||(t<(N0-2))||((t==(N0-2))&&(bc!=0))||(bc==3)) + { + su3xsu3dag(vd+1,udb+ip[3],wd+1); + cm3x3_retr(wd,wd+1,trU+2); + trU[2]=3.0-trU[2]; + } + } +} + + +double action0(int icom) +{ + int bc,n,ix,t; + double c0,c1,*cG; + double r0,r1,trU[4],act; + lat_parms_t lat; + bc_parms_t bcp; + + lat=lat_parms(); + c0=lat.c0; + c1=lat.c1; + + bcp=bc_parms(); + bc=bcp.type; + cG=bcp.cG; + + if (query_flags(UDBUF_UP2DATE)!=1) + copy_bnd_ud(); + udb=udfld(); + + if (c0==1.0) + hdb=NULL; + else + { + if (init==0) + set_ofs(); + + if (query_flags(BSTAP_UP2DATE)!=1) + set_bstap(); + hdb=bstap(); + } + + for (n=0;n0)||(bc!=1)) + { + r0=c0; + r1=c1; + + if ((t==0)&&(bc!=3)) + { + r0*=(0.5*cG[0]); + r1*=(0.5*cG[0]); + } + else if ((t==(N0-1))&&(bc==0)) + { + r0*=(0.5*cG[1]); + r1*=(0.5*cG[1]); + } + + for (n=3;n<6;n++) + { + wloops(n,ix,t,c0,trU); + act+=(r0*trU[0]+r1*(trU[1]+trU[2])); + } + } + + cnt[0]+=1; + smx[0]+=act; + + for (n=1;(cnt[n-1]>=BLK_LENGTH)&&(n0) 2+2*(icr>0) 2+2*(icr>0) +* action1() 1 1 1 +* +* (these figures do not include the workspace required by the solvers). +* +* The programs in this module perform global communications and must be +* called simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define FORCE1_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "mdflds.h" +#include "sw_term.h" +#include "sflds.h" +#include "dirac.h" +#include "linalg.h" +#include "sap.h" +#include "dfl.h" +#include "update.h" +#include "forces.h" +#include "global.h" + + +double setpf1(double mu,int ipf,int icom) +{ + double act; + spinor_dble **wsd,*phi; + mdflds_t *mdfs; + tm_parms_t tm; + + tm=tm_parms(); + if (tm.eoflg==1) + set_tm_parms(0); + + wsd=reserve_wsd(1); + random_sd(VOLUME,wsd[0],1.0); + bnd_sd2zero(ALL_PTS,wsd[0]); + act=norm_square_dble(VOLUME,icom,wsd[0]); + + sw_term(NO_PTS); + + mdfs=mdflds(); + phi=(*mdfs).pf[ipf]; + Dw_dble(mu,wsd[0],phi); + mulg5_dble(VOLUME,phi); + release_wsd(); + + return act; +} + + +void force1(double mu,int ipf,int isp,int icr,double c,int *status) +{ + int l; + double res0,res1; + spinor_dble *phi,*chi,*psi,**wsd; + spinor_dble *rho,*eta,**rsd; + mdflds_t *mdfs; + solver_parms_t sp; + sap_parms_t sap; + tm_parms_t tm; + + tm=tm_parms(); + if (tm.eoflg==1) + set_tm_parms(0); + + mdfs=mdflds(); + sp=solver_parms(isp); + sw_term(NO_PTS); + + wsd=reserve_wsd(2); + phi=(*mdfs).pf[ipf]; + psi=wsd[0]; + chi=wsd[1]; + + if (sp.solver==CGNE) + { + if (get_chrono(icr,chi)) + { + rsd=reserve_wsd(1); + rho=rsd[0]; + + Dw_dble(-mu,chi,psi); + mulg5_dble(VOLUME,psi); + Dw_dble(mu,psi,rho); + mulg5_dble(VOLUME,rho); + mulr_spinor_add_dble(VOLUME,rho,phi,-1.0); + + res0=norm_square_dble(VOLUME,1,phi); + res1=norm_square_dble(VOLUME,1,rho); + res1=sqrt(res1/res0); + + if (res1<1.0) + { + if (res1>sp.res) + { + tmcg(sp.nmx,sp.res/res1,mu,rho,psi,status); + mulr_spinor_add_dble(VOLUME,chi,psi,-1.0); + } + else + status[0]=0; + } + else + tmcg(sp.nmx,sp.res,mu,phi,chi,status); + + release_wsd(); + } + else + tmcg(sp.nmx,sp.res,mu,phi,chi,status); + + error_root(status[0]<0,1,"force1 [force1.c]", + "CGNE solver failed (mu = %.4e, parameter set no %d, " + "status = %d)",mu,isp,status[0]); + if (icr) + add_chrono(icr,chi); + Dw_dble(-mu,chi,psi); + mulg5_dble(VOLUME,psi); + } + else if (sp.solver==SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + if (get_chrono(icr,chi)) + { + rsd=reserve_wsd(2); + rho=rsd[0]; + eta=rsd[1]; + + Dw_dble(-mu,chi,psi); + mulg5_dble(VOLUME,psi); + Dw_dble(mu,psi,rho); + mulg5_dble(VOLUME,rho); + mulr_spinor_add_dble(VOLUME,rho,phi,-1.0); + + res0=norm_square_dble(VOLUME,1,phi); + res1=norm_square_dble(VOLUME,1,rho); + res1=sqrt(res1/res0); + + if (res1<1.0) + { + if (res1>sp.res) + { + mulg5_dble(VOLUME,rho); + sap_gcr(sp.nkv,sp.nmx,sp.res/res1,mu,rho,eta,status); + mulr_spinor_add_dble(VOLUME,psi,eta,-1.0); + + res0=norm_square_dble(VOLUME,1,psi); + res1=norm_square_dble(VOLUME,1,eta); + res1=sqrt(res1/res0); + + if (res1<1.0) + { + if (res1>sp.res) + { + mulg5_dble(VOLUME,eta); + sap_gcr(sp.nkv,sp.nmx,sp.res/res1,-mu,eta,rho,status+1); + mulr_spinor_add_dble(VOLUME,chi,rho,-1.0); + } + else + status[1]=0; + } + else + { + mulg5_dble(VOLUME,psi); + sap_gcr(sp.nkv,sp.nmx,sp.res,-mu,psi,chi,status+1); + mulg5_dble(VOLUME,psi); + } + } + else + { + status[0]=0; + status[1]=0; + } + } + else + { + mulg5_dble(VOLUME,phi); + sap_gcr(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + mulg5_dble(VOLUME,phi); + mulg5_dble(VOLUME,psi); + sap_gcr(sp.nkv,sp.nmx,sp.res,-mu,psi,chi,status+1); + mulg5_dble(VOLUME,psi); + } + + release_wsd(); + } + else + { + mulg5_dble(VOLUME,phi); + sap_gcr(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + mulg5_dble(VOLUME,phi); + mulg5_dble(VOLUME,psi); + sap_gcr(sp.nkv,sp.nmx,sp.res,-mu,psi,chi,status+1); + mulg5_dble(VOLUME,psi); + } + + error_root((status[0]<0)||(status[1]<0),1,"force1 [force1.c]", + "SAP_GCR solver failed (mu = %.4e, parameter set no %d, " + "status = %d;%d)",mu,isp,status[0],status[1]); + if (icr) + add_chrono(icr,chi); + } + else if (sp.solver==DFL_SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + if (get_chrono(icr,chi)) + { + rsd=reserve_wsd(2); + rho=rsd[0]; + eta=rsd[1]; + + Dw_dble(-mu,chi,psi); + mulg5_dble(VOLUME,psi); + Dw_dble(mu,psi,rho); + mulg5_dble(VOLUME,rho); + mulr_spinor_add_dble(VOLUME,rho,phi,-1.0); + + res0=norm_square_dble(VOLUME,1,phi); + res1=norm_square_dble(VOLUME,1,rho); + res1=sqrt(res1/res0); + + if (res1<1.0) + { + if (res1>sp.res) + { + mulg5_dble(VOLUME,rho); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res/res1,mu,rho,eta,status); + mulr_spinor_add_dble(VOLUME,psi,eta,-1.0); + + res0=norm_square_dble(VOLUME,1,psi); + res1=norm_square_dble(VOLUME,1,eta); + res1=sqrt(res1/res0); + + if (res1<1.0) + { + if (res1>sp.res) + { + mulg5_dble(VOLUME,eta); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res/res1,-mu,eta,rho, + status+3); + mulr_spinor_add_dble(VOLUME,chi,rho,-1.0); + } + else + { + for (l=3;l<6;l++) + status[l]=0; + } + } + else + { + mulg5_dble(VOLUME,psi); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,-mu,psi,chi,status+3); + mulg5_dble(VOLUME,psi); + } + } + else + { + for (l=0;l<6;l++) + status[l]=0; + } + } + else + { + mulg5_dble(VOLUME,phi); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + mulg5_dble(VOLUME,phi); + mulg5_dble(VOLUME,psi); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,-mu,psi,chi,status+3); + mulg5_dble(VOLUME,psi); + } + + release_wsd(); + } + else + { + mulg5_dble(VOLUME,phi); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + mulg5_dble(VOLUME,phi); + mulg5_dble(VOLUME,psi); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,-mu,psi,chi,status+3); + mulg5_dble(VOLUME,psi); + } + + error_root((status[0]<0)||(status[1]<0)||(status[3]<0)||(status[4]<0),1, + "force1 [force1.c]","DFL_SAP_GCR solver failed " + "(mu = %.4e, parameter set no %d, status = %d,%d,%d;%d,%d,%d)", + mu,isp,status[0],status[1],status[2], + status[3],status[4],status[5]); + + if (icr) + add_chrono(icr,chi); + } + else + error_root(1,1,"force1 [force1.c]","Unknown solver"); + + set_xt2zero(); + add_prod2xt(1.0,chi,psi); + sw_frc(c); + + set_xv2zero(); + add_prod2xv(1.0,chi,psi); + hop_frc(c); + + release_wsd(); +} + + +double action1(double mu,int ipf,int isp,int icom,int *status) +{ + double act; + spinor_dble *phi,*psi,**wsd,**rsd; + mdflds_t *mdfs; + solver_parms_t sp; + sap_parms_t sap; + tm_parms_t tm; + + tm=tm_parms(); + if (tm.eoflg==1) + set_tm_parms(0); + + mdfs=mdflds(); + sp=solver_parms(isp); + + wsd=reserve_wsd(1); + psi=wsd[0]; + phi=(*mdfs).pf[ipf]; + + if (sp.solver==CGNE) + { + tmcg(sp.nmx,sp.res,mu,phi,psi,status); + + error_root(status[0]<0,1,"action1 [force1.c]", + "CGNE solver failed (mu = %.4e, parameter set no %d, " + "status = %d)",mu,isp,status[0]); + + rsd=reserve_wsd(1); + Dw_dble(-mu,psi,rsd[0]); + act=norm_square_dble(VOLUME,icom,rsd[0]); + release_wsd(); + } + else if (sp.solver==SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + mulg5_dble(VOLUME,phi); + sap_gcr(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + mulg5_dble(VOLUME,phi); + + error_root(status[0]<0,1,"action1 [force1.c]", + "SAP_GCR solver failed (mu = %.4e, parameter set no %d, " + "status = %d)",mu,isp,status[0]); + + act=norm_square_dble(VOLUME,icom,psi); + } + else if (sp.solver==DFL_SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + mulg5_dble(VOLUME,phi); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + mulg5_dble(VOLUME,phi); + + error_root((status[0]<0)||(status[1]<0),1, + "action1 [force1.c]","DFL_SAP_GCR solver failed " + "(mu = %.4e, parameter set no %d, status = %d,%d,%d)", + mu,isp,status[0],status[1],status[2]); + + act=norm_square_dble(VOLUME,icom,psi); + } + else + { + error_root(1,1,"action1 [force1.c]","Unknown solver"); + act=0.0; + } + + release_wsd(); + + return act; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force2.c new file mode 100644 index 0000000000000000000000000000000000000000..53c37312534d5127cec2856b9ee08dacabd7fb8f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force2.c @@ -0,0 +1,218 @@ + +/******************************************************************************* +* +* File force2.c +* +* Copyright (C) 2011-2013 Stefan Schaefer, Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Hasenbusch twisted_mass pseudo-fermion action and force. +* +* The externally accessible functions are +* +* double setpf2(double mu0,double mu1,int ipf,int isp,int icom, +* int *status) +* Generates a pseudo-fermion field phi with probability proportional +* to exp(-Spf) and returns the action Spf-(phi,phi) (see the notes). +* +* void force2(double mu0,int mu1,int ipf,int isp,int icr,double c, +* int *status) +* Computes the force deriving from the action Spf (see the notes). +* The calculated force is multiplied by c and added to the molecular- +* dynamics force field. +* +* double action2(double mu0,double mu1,int ipf,int isp,int icom, +* int *status) +* Returns the action Spf-(phi,phi) (see the notes). +* +* Notes: +* +* The pseudo-fermion action Spf is given by +* +* Spf=(phi,(Dw^dag*Dw+mu1^2)(Dw^dag*Dw+mu0^2)^(-1)*phi) +* +* =(phi,phi)+(mu1^2-mu0^2)*(phi,(Dw^dag*Dw+mu0^2)^(-1)*phi) +* +* where Dw denotes the (improved) Wilson-Dirac operator and phi the pseudo- +* fermion field. +* +* The common parameters of the programs in this module are: +* +* mu0,mu1 Twisted mass parameters in Spf. +* +* ipf Index of the pseudo-fermion field phi in the +* structure returned by mdflds() [mdflds.c]. +* +* isp Index of the solver parameter set that describes +* the solver to be used for the solution of the +* Dirac equation. +* +* icom The action returned by the programs setpf3() and +* action3() is summed over all MPI processes if icom=1. +* Otherwise the local part of the action is returned. +* +* status Status values returned by the solver used for the +* solution of the Dirac equation. +* +* The supported solvers are CGNE, SAP_GCR and DFL_SAP_GCR. Depending +* on the program and the solver, the number of status variables varies +* and is given by: +* +* CGNE SAP_GCR DFL_SAP_GCR +* setpf2() 1 1 3 +* force2() 1 2 6 +* action2() 1 1 3 +* +* The solver used in the case of setpf2() is for the Dirac equation with +* twisted mass mu1, while force2() and action2() use the solver for the +* equation with twisted mass mu0. Different solvers may be needed in the +* two cases if mu1>>mu0, for example. +* +* Note that, in force2(), the GCR solvers solve the Dirac equations twice. +* In these cases, the program writes the status values one after the other +* to the array. The bare quark mass m0 is the one last set by sw_parms() +* [flags/lat_parms.c] and it is taken for granted that the parameters of +* the solver have been set by set_solver_parms() [flags/solver_parms.c]. +* +* The program force2() attempts to propagate the solutions of the Dirac +* equation along the molecular-dynamics trajectories, using the field +* stack number icr (no fields are propagated if icr=0). If this feature +* is used, the program setup_chrono() [update/chrono.c] must be called +* before force2() is called for the first time. +* +* The required workspaces of double-precision spinor fields are +* +* CGNE SAP_GCR DFL_SAP_GCR +* setpf2() 1 1 1 +* force2() 2+(icr>0) 2+2*(icr>0) 2+2*(icr>0) +* action2() 1 1 1 +* +* (these figures do not include the workspace required by the solvers). +* +* The programs in this module perform global communications and must be +* called simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define FORCE2_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "mdflds.h" +#include "sw_term.h" +#include "sflds.h" +#include "dirac.h" +#include "linalg.h" +#include "sap.h" +#include "dfl.h" +#include "forces.h" +#include "global.h" + + +double setpf2(double mu0,double mu1,int ipf,int isp,int icom,int *status) +{ + double act; + complex_dble z; + spinor_dble **wsd,**rsd; + spinor_dble *phi,*psi,*chi; + mdflds_t *mdfs; + solver_parms_t sp; + sap_parms_t sap; + tm_parms_t tm; + + tm=tm_parms(); + if (tm.eoflg==1) + set_tm_parms(0); + + mdfs=mdflds(); + phi=(*mdfs).pf[ipf]; + wsd=reserve_wsd(1); + psi=wsd[0]; + + random_sd(VOLUME,phi,1.0); + bnd_sd2zero(ALL_PTS,phi); + sp=solver_parms(isp); + + if (sp.solver==CGNE) + { + tmcg(sp.nmx,sp.res,mu1,phi,psi,status); + + error_root(status[0]<0,1,"setpf2 [force2.c]","CGNE solver failed " + "(mu = %.4e, parameter set no %d, status = %d)", + mu1,isp,status[0]); + + rsd=reserve_wsd(1); + chi=rsd[0]; + assign_sd2sd(VOLUME,psi,chi); + Dw_dble(-mu1,chi,psi); + mulg5_dble(VOLUME,psi); + release_wsd(); + } + else if (sp.solver==SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + mulg5_dble(VOLUME,phi); + sap_gcr(sp.nkv,sp.nmx,sp.res,mu1,phi,psi,status); + mulg5_dble(VOLUME,phi); + + error_root(status[0]<0,1,"setpf2 [force2.c]","SAP_GCR solver failed " + "(mu = %.4e, parameter set no %d, status = %d)", + mu1,isp,status[0]); + } + else if (sp.solver==DFL_SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + mulg5_dble(VOLUME,phi); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu1,phi,psi,status); + mulg5_dble(VOLUME,phi); + + error_root((status[0]<0)||(status[1]<0),1, + "setpf2 [force2.c]","DFL_SAP_GCR solver failed " + "(mu = %.4e, parameter set no %d, status = %d,%d,%d)", + mu1,isp,status[0],status[1],status[2]); + } + else + error_root(1,1,"setpf2 [force2.c]","Unknown solver"); + + z.re=0.0; + z.im=mu0-mu1; + mulc_spinor_add_dble(VOLUME,phi,psi,z); + act=(mu1*mu1-mu0*mu0)*norm_square_dble(VOLUME,icom,psi); + release_wsd(); + + return act; +} + + +void force2(double mu0,double mu1,int ipf,int isp,int icr, + double c,int *status) +{ + double dmu2; + + dmu2=mu1*mu1-mu0*mu0; + + force1(mu0,ipf,isp,icr,dmu2*c,status); +} + + +double action2(double mu0,double mu1,int ipf,int isp,int icom,int *status) +{ + double dmu2,act; + + dmu2=mu1*mu1-mu0*mu0; + act=dmu2*action1(mu0,ipf,isp,icom,status); + + return act; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force3.c new file mode 100644 index 0000000000000000000000000000000000000000..d39d2c7f743fdfcc02676d44edbe3873fb2cce98 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force3.c @@ -0,0 +1,674 @@ + +/******************************************************************************* +* +* File force3.c +* +* Copyright (C) 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Rational function forces. +* +* The externally accessible functions are +* +* double setpf3(int *irat,int ipf,int isw,int isp,int icom,int *status) +* Generates a pseudo-fermion field phi with probability proportional +* to exp(-Spf) and returns the action Spf+Sdet-(phi,phi) if isw=1 or +* Spf-(phi,phi) if isw!=1 (see the notes). +* +* void force3(int *irat,int ipf,int isw,int isp,double c,int *status) +* Computes the force deriving from the action Spf+Sdet if isw=1 or +* Spf if isw!=1 (see the notes). The calculated force is multiplied +* by c and added to the molecular-dynamics force field. +* +* double action3(int *irat,int ipf,int isw,int isp,int icom,int *status) +* Returns the action Spf+Sdet-(phi,phi) if isw=1 or Spf-(phi,phi) if +* isw!=1 (see the notes). +* +* Notes: +* +* Simulations including the charm and/or the strange quark are based on +* a version of the RHMC algorithm. See the notes "Charm and strange quark +* in openQCD simulations" (file doc/rhmc.pdf). +* +* The pseudo-fermion action Spf is given by +* +* Spf=(phi,P_{k,l}*phi), +* +* where P_{k,l} is the fraction of a Zolotarev rational function, which +* is defined by the parameters: +* +* irat[0] Index of the Zolotarev rational function in the +* parameter data base. +* +* irat[1] Lower end k of the selected coefficient range. +* +* irat[2] Upper end l of the selected coefficient range. +* +* See ratfcts/ratfcts.c for further explanations. The inclusion of the +* "small quark determinant" amounts to adding the action +* +* Sdet=-ln{det(1e+Doo)}+constant +* +* to the molecular-dynamics Hamilton function, where 1e is the projector +* to the quark fields that vanish on the odd lattice sites and Doo the +* odd-odd component of the Dirac operator (the constant is adjusted so +* as to reduce the significance losses when the action differences are +* computed at the end of the molecular-dynamics trajectories). +* +* The other parameters of the programs in this module are: +* +* ipf Index of the pseudo-fermion field phi in the +* structure returned by mdflds() [mdflds.c]. +* +* isp Index of the solver parameter set that describes +* the solver to be used for the solution of the +* Dirac equation. +* +* icom The action returned by the programs setpf3() and +* action3() is summed over all MPI processes if icom=1. +* Otherwise the local part of the action is returned. +* +* status Array of the average status values returned by the +* solver used for the solution of the Dirac equation +* (in the case of the DFL_SAP_GCR solver, status[2] +* and status[5] are not averaged). +* +* The supported solvers are MSCG, SAP_GCR and DFL_SAP_GCR. Depending +* on the program and the solver, the number of status variables varies +* and is given by: +* +* MSCG SAP_GCR DFL_SAP_GCR +* setpf3() 1 1 3 +* force3() 1 2 6 +* action3() 1 1 3 +* +* Note that, in force3(), the GCR solvers solve the Dirac equations twice. +* In these cases, the program writes the status values one after the other +* to the array. The bare quark mass m0 is the one last set by sw_parms() +* [flags/lat_parms.c] and it is taken for granted that the parameters of +* the solver have been set by set_solver_parms() [flags/solver_parms.c]. +* +* The required workspaces of double-precision spinor fields are +* +* MSCG SAP_GCR DFL_SAP_GCR +* setpf3() np 2 2 +* force3() np 3 3 +* action3() np 1 1 +* +* where np is the number of poles of P_{k,l} (these figures do not include +* the workspace required by the solvers). +* +* The programs in this module perform global communications and must be +* called simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define FORCE3_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "mdflds.h" +#include "sw_term.h" +#include "sflds.h" +#include "dirac.h" +#include "linalg.h" +#include "sap.h" +#include "dfl.h" +#include "ratfcts.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define MAX_LEVELS 8 +#define BLK_LENGTH 8 + +static int cnt[MAX_LEVELS],nps=0; +static double smx[MAX_LEVELS],*rs; + + +static void set_res(int np,double res) +{ + int k; + + if (np>nps) + { + if (nps>0) + free(rs); + + rs=malloc(np*sizeof(*rs)); + error(rs==NULL,1,"set_res [force3.c]", + "Unable to allocate auxiliary array"); + } + + for (k=0;k1.0) + c=pow(4.0+swp.m0,-6.0); + else + c=1.0; + + for (n=0;nVOLUME) + iy=VOLUME; + + for (;ix0)||(bc==3))&&((t<(N0-1))||(bc!=0))) + { + z=det_pauli_dble(0.0,m); + + if (z.re>0.0) + p*=(c*z.re); + else + ie=1; + + z=det_pauli_dble(0.0,m+1); + + if (z.re>0.0) + p*=(c*z.re); + else + ie=1; + } + + m+=2; + } + + if (p>0.0) + { + cnt[0]+=1; + smx[0]-=log(p); + + for (n=1;(cnt[n-1]>=BLK_LENGTH)&&(n0) 2+2*(icr>0) 2+2*(icr>0) +* action4() 1 1 1 +* +* (these figures do not include the workspace required by the solvers). +* +* The programs in this module perform global communications and must be +* called simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define FORCE4_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "mdflds.h" +#include "sw_term.h" +#include "sflds.h" +#include "dirac.h" +#include "linalg.h" +#include "sap.h" +#include "dfl.h" +#include "update.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define MAX_LEVELS 8 +#define BLK_LENGTH 8 + +static int cnt[MAX_LEVELS]; +static double smx[MAX_LEVELS]; + + +static double sdet(void) +{ + int bc,ix,iy,t,n,ie; + double c,p; + complex_dble z; + pauli_dble *m; + sw_parms_t swp; + + swp=sw_parms(); + + if ((4.0+swp.m0)>1.0) + c=pow(4.0+swp.m0,-6.0); + else + c=1.0; + + for (n=0;nVOLUME) + iy=VOLUME; + + for (;ix0)||(bc==3))&&((t<(N0-1))||(bc!=0))) + { + z=det_pauli_dble(0.0,m); + + if (z.re>0.0) + p*=(c*z.re); + else + ie=1; + + z=det_pauli_dble(0.0,m+1); + + if (z.re>0.0) + p*=(c*z.re); + else + ie=1; + } + + m+=2; + } + + if (p!=0.0) + { + cnt[0]+=1; + smx[0]-=2.0*log(p); + + for (n=1;(cnt[n-1]>=BLK_LENGTH)&&(nsp.res) + { + tmcgeo(sp.nmx,sp.res/res1,mu,rho,psi,status); + mulr_spinor_add_dble(VOLUME/2,chi,psi,-1.0); + } + else + status[0]=0; + } + else + tmcgeo(sp.nmx,sp.res,mu,phi,chi,status); + + release_wsd(); + } + else + tmcgeo(sp.nmx,sp.res,mu,phi,chi,status); + + error_root(status[0]<0,1,"force4 [force4.c]", + "CGNE solver failed (mu = %.4e, parameter set no %d, " + "status = %d)",mu,isp,status[0]); + + Dwoe_dble(chi,chi); + Dwoo_dble(0.0,chi,chi); + Dwhat_dble(-mu,chi,psi); + mulg5_dble(VOLUME/2,psi); + Dwoe_dble(psi,psi); + Dwoo_dble(0.0,psi,psi); + + if (icr) + add_chrono(icr,chi); + + add_prod2xt(1.0,chi,psi); + add_prod2xv(-1.0,chi,psi); + } + else if (sp.solver==SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + if (get_chrono(icr,chi)) + { + rsd=reserve_wsd(2); + rho=rsd[0]; + eta=rsd[1]; + + ifail=sw_term(ODD_PTS); + error_root(ifail!=0,1,"force4 [force4.c]", + "Inversion of the SW term was not safe"); + + Dwhat_dble(-mu,chi,psi); + mulg5_dble(VOLUME/2,psi); + Dwhat_dble(mu,psi,rho); + mulg5_dble(VOLUME/2,rho); + mulr_spinor_add_dble(VOLUME/2,rho,phi,-1.0); + + res0=norm_square_dble(VOLUME/2,1,phi); + res1=norm_square_dble(VOLUME/2,1,rho); + res1=sqrt(res1/res0); + + if (res1<1.0) + { + Dwoe_dble(chi,chi); + Dwoo_dble(0.0,chi,chi); + scale_dble(VOLUME/2,-1.0,chi+(VOLUME/2)); + + Dwoe_dble(psi,psi); + Dwoo_dble(0.0,psi,psi); + scale_dble(VOLUME/2,-1.0,psi+(VOLUME/2)); + + if (res1>sp.res) + { + mulg5_dble(VOLUME/2,rho); + set_sd2zero(VOLUME/2,rho+(VOLUME/2)); + + sap_gcr(sp.nkv,sp.nmx,sp.res/res1,mu,rho,eta,status); + + mulr_spinor_add_dble(VOLUME,psi,eta,-1.0); + + res0=norm_square_dble(VOLUME/2,1,psi); + res1=norm_square_dble(VOLUME/2,1,eta); + res1=sqrt(res1/res0); + + if (res1<1.0) + { + if (res1>sp.res) + { + mulg5_dble(VOLUME/2,eta); + set_sd2zero(VOLUME/2,eta+(VOLUME/2)); + + sap_gcr(sp.nkv,sp.nmx,sp.res/res1,-mu,eta,rho,status+1); + + mulr_spinor_add_dble(VOLUME,chi,rho,-1.0); + } + else + status[1]=0; + } + else + { + assign_sd2sd(VOLUME/2,psi,eta); + mulg5_dble(VOLUME/2,eta); + set_sd2zero(VOLUME/2,eta+(VOLUME/2)); + + sap_gcr(sp.nkv,sp.nmx,sp.res,-mu,eta,chi,status+1); + } + } + else + { + status[0]=0; + status[1]=0; + } + } + else + { + mulg5_dble(VOLUME/2,phi); + set_sd2zero(VOLUME/2,phi+(VOLUME/2)); + + sap_gcr(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + + mulg5_dble(VOLUME/2,phi); + assign_sd2sd(VOLUME/2,psi,eta); + mulg5_dble(VOLUME/2,eta); + set_sd2zero(VOLUME/2,eta+(VOLUME/2)); + + sap_gcr(sp.nkv,sp.nmx,sp.res,-mu,eta,chi,status+1); + } + + release_wsd(); + } + else + { + rsd=reserve_wsd(1); + eta=rsd[0]; + + mulg5_dble(VOLUME/2,phi); + set_sd2zero(VOLUME/2,phi+(VOLUME/2)); + + sap_gcr(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + + mulg5_dble(VOLUME/2,phi); + assign_sd2sd(VOLUME/2,psi,eta); + mulg5_dble(VOLUME/2,eta); + set_sd2zero(VOLUME/2,eta+(VOLUME/2)); + + sap_gcr(sp.nkv,sp.nmx,sp.res,-mu,eta,chi,status+1); + + release_wsd(); + } + + error_root((status[0]<0)||(status[1]<0),1,"force4 [force4.c]", + "SAP_GCR solver failed (mu = %.4e, parameter set no %d, " + "status = %d;%d)",mu,isp,status[0],status[1]); + + if (icr) + add_chrono(icr,chi); + + add_prod2xt(1.0,chi,psi); + add_prod2xv(1.0,chi,psi); + } + else if (sp.solver==DFL_SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + if (get_chrono(icr,chi)) + { + rsd=reserve_wsd(2); + rho=rsd[0]; + eta=rsd[1]; + + ifail=sw_term(ODD_PTS); + error_root(ifail!=0,1,"force4 [force4.c]", + "Inversion of the SW term was not safe"); + + Dwhat_dble(-mu,chi,psi); + mulg5_dble(VOLUME/2,psi); + Dwhat_dble(mu,psi,rho); + mulg5_dble(VOLUME/2,rho); + mulr_spinor_add_dble(VOLUME/2,rho,phi,-1.0); + + res0=norm_square_dble(VOLUME/2,1,phi); + res1=norm_square_dble(VOLUME/2,1,rho); + res1=sqrt(res1/res0); + + if (res1<1.0) + { + Dwoe_dble(chi,chi); + Dwoo_dble(0.0,chi,chi); + scale_dble(VOLUME/2,-1.0,chi+(VOLUME/2)); + + Dwoe_dble(psi,psi); + Dwoo_dble(0.0,psi,psi); + scale_dble(VOLUME/2,-1.0,psi+(VOLUME/2)); + + if (res1>sp.res) + { + mulg5_dble(VOLUME/2,rho); + set_sd2zero(VOLUME/2,rho+(VOLUME/2)); + + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res/res1,mu,rho,eta, + status); + + mulr_spinor_add_dble(VOLUME,psi,eta,-1.0); + + res0=norm_square_dble(VOLUME/2,1,psi); + res1=norm_square_dble(VOLUME/2,1,eta); + res1=sqrt(res1/res0); + + if (res1<1.0) + { + if (res1>sp.res) + { + mulg5_dble(VOLUME/2,eta); + set_sd2zero(VOLUME/2,eta+(VOLUME/2)); + + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res/res1,-mu,eta,rho, + status+3); + + mulr_spinor_add_dble(VOLUME,chi,rho,-1.0); + } + else + { + for (l=3;l<6;l++) + status[l]=0; + } + } + else + { + assign_sd2sd(VOLUME/2,psi,eta); + mulg5_dble(VOLUME/2,eta); + set_sd2zero(VOLUME/2,eta+(VOLUME/2)); + + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,-mu,eta,chi,status+3); + } + } + else + { + for (l=0;l<6;l++) + status[l]=0; + } + } + else + { + mulg5_dble(VOLUME/2,phi); + set_sd2zero(VOLUME/2,phi+(VOLUME/2)); + + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + + mulg5_dble(VOLUME/2,phi); + assign_sd2sd(VOLUME/2,psi,eta); + mulg5_dble(VOLUME/2,eta); + set_sd2zero(VOLUME/2,eta+(VOLUME/2)); + + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,-mu,eta,chi,status+3); + } + + release_wsd(); + } + else + { + rsd=reserve_wsd(1); + eta=rsd[0]; + + mulg5_dble(VOLUME/2,phi); + set_sd2zero(VOLUME/2,phi+(VOLUME/2)); + + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + + mulg5_dble(VOLUME/2,phi); + assign_sd2sd(VOLUME/2,psi,eta); + mulg5_dble(VOLUME/2,eta); + set_sd2zero(VOLUME/2,eta+(VOLUME/2)); + + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,-mu,eta,chi,status+3); + + release_wsd(); + } + + error_root((status[0]<0)||(status[1]<0)||(status[3]<0)||(status[4]<0),1, + "force4 [force4.c]","DFL_SAP_GCR solver failed " + "(mu = %.4e, parameter set no %d, status = %d,%d,%d;%d,%d,%d)", + mu,isp,status[0],status[1],status[2],status[3], + status[4],status[5]); + + if (icr) + add_chrono(icr,chi); + + add_prod2xt(1.0,chi,psi); + add_prod2xv(1.0,chi,psi); + } + else + error_root(1,1,"force4 [force4.c]","Unknown solver"); + + sw_frc(c); + hop_frc(c); + + release_wsd(); +} + + +double action4(double mu,int ipf,int isw,int isp,int icom,int *status) +{ + double act,r; + spinor_dble *phi,*chi,*psi; + spinor_dble **rsd,**wsd; + mdflds_t *mdfs; + solver_parms_t sp; + sap_parms_t sap; + tm_parms_t tm; + + tm=tm_parms(); + if (tm.eoflg!=1) + set_tm_parms(1); + + mdfs=mdflds(); + phi=(*mdfs).pf[ipf]; + sp=solver_parms(isp); + + if (isw==1) + act=sdet(); + else + act=0.0; + + if (sp.solver==CGNE) + { + rsd=reserve_wsd(1); + chi=rsd[0]; + + tmcgeo(sp.nmx,sp.res,mu,phi,chi,status); + + error_root(status[0]<0,1,"action4 [force4.c]", + "CGNE solver failed (mu = %.4e, parameter set no %d, " + "status = %d)",mu,isp,status[0]); + + wsd=reserve_wsd(1); + psi=wsd[0]; + + Dwhat_dble(-mu,chi,psi); + act+=norm_square_dble(VOLUME/2,0,psi); + + release_wsd(); + release_wsd(); + } + else if (sp.solver==SAP_GCR) + { + rsd=reserve_wsd(1); + psi=rsd[0]; + + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + mulg5_dble(VOLUME/2,phi); + set_sd2zero(VOLUME/2,phi+(VOLUME/2)); + + sap_gcr(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + + error_root(status[0]<0,1,"action4 [force4.c]", + "SAP_GCR solver failed (mu = %.4e, parameter set no %d, " + "status = %d)",mu,isp,status[0]); + + mulg5_dble(VOLUME/2,phi); + act+=norm_square_dble(VOLUME/2,0,psi); + + release_wsd(); + } + else if (sp.solver==DFL_SAP_GCR) + { + rsd=reserve_wsd(1); + psi=rsd[0]; + + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + mulg5_dble(VOLUME/2,phi); + set_sd2zero(VOLUME/2,phi+(VOLUME/2)); + + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + + error_root((status[0]<0)||(status[1]<0),1,"action4 [force4.c]", + "DFL_SAP_GCR solver failed (mu = %.4e, parameter set " + "no %d, status = %d,%d,%d)",mu,isp, + status[0],status[1],status[2]); + + mulg5_dble(VOLUME/2,phi); + act+=norm_square_dble(VOLUME/2,0,psi); + + release_wsd(); + } + else + error_root(1,1,"action4 [force4.c]","Unknown solver"); + + if (icom==1) + { + r=act; + MPI_Reduce(&r,&act,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&act,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + return act; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force5.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force5.c new file mode 100644 index 0000000000000000000000000000000000000000..3d8d9f19a856709c92dcb198e0f8d6be81380be2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force5.c @@ -0,0 +1,220 @@ + +/******************************************************************************* +* +* File force5.c +* +* Copyright (C) 2011-2013 Stefan Schaefer, Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Hasenbusch twisted mass pseudo-fermion action and force with even-odd +* precconditioning. +* +* The externally accessible functions are +* +* double setpf5(double mu0,double mu1,int ipf,int isp,int icom, +* int *status) +* Generates a pseudo-fermion field phi with probability proportional +* to exp(-Spf) and returns the action Spf-(phi,phi) (see the notes). +* +* void force5(double mu0,int mu1,int ipf,int isp,int icr,double c, +* int *status) +* Computes the force deriving from the action Spf (see the notes). +* The calculated force is multiplied by c and added to the molecular- +* dynamics force field. +* +* double action5(double mu0,double mu1,int ipf,int isp,int icom, +* int *status) +* Returns the action Spf-(phi,phi) (see the notes). +* +* Notes: +* +* The pseudo-fermion action Spf is given by +* +* Spf=(phi,(Dwhat^dag*Dwhat+mu1^2)(Dwhat^dag*Dwhat+mu0^2)^(-1)*phi) +* +* =(phi,phi)+(mu1^2-mu0^2)*(phi,(Dwhat^dag*Dwhat+mu0^2)^(-1)*phi) +* +* where Dwhat denotes the even-odd preconditioned (improved) Wilson-Dirac +* operator and phi the pseudo-fermion field. The latter vanishes on the +* odd lattice sites. +* +* The common parameters of the programs in this module are: +* +* mu0,mu1 Twisted mass parameters in Spf. +* +* ipf Index of the pseudo-fermion field phi in the +* structure returned by mdflds() [mdflds.c]. +* +* isp Index of the solver parameter set that describes +* the solver to be used for the solution of the +* Dirac equation. +* +* icom The action returned by the programs setpf3() and +* action3() is summed over all MPI processes if icom=1. +* Otherwise the local part of the action is returned. +* +* status Status values returned by the solver used for the +* solution of the Dirac equation. +* +* The supported solvers are CGNE, SAP_GCR and DFL_SAP_GCR. Depending +* on the program and the solver, the number of status variables varies +* and is given by: +* +* CGNE SAP_GCR DFL_SAP_GCR +* setpf5() 1 1 3 +* force5() 1 2 6 +* action5() 1 1 3 +* +* The solver used in the case of setpf5() is for the Dirac equation with +* twisted mass mu1, while force5() and action5() use the solver for the +* equation with twisted mass mu0. Different solvers may be needed in the +* two cases if mu1>>mu0, for example. +* +* Note that, in force5(), the GCR solvers solve the Dirac equations twice. +* In these cases, the program writes the status values one after the other +* to the array. The bare quark mass m0 is the one last set by sw_parms() +* [flags/lat_parms.c] and it is taken for granted that the parameters of +* the solver have been set by set_solver_parms() [flags/solver_parms.c]. +* +* The program force5() attempts to propagate the solutions of the Dirac +* equation along the molecular-dynamics trajectories, using the field +* stack number icr (no fields are propagated if icr=0). If this feature +* is used, the program setup_chrono() [update/chrono.c] must be called +* before force5() is called for the first time. +* +* The required workspaces of double-precision spinor fields are +* +* CGNE SAP_GCR DFL_SAP_GCR +* setpf5() 1 1 1 +* force5() 2+(icr>0) 2+2*(icr>0) 2+2*(icr>0) +* action5() 1 1 1 +* +* (these figures do not include the workspace required by the solvers). +* +* The programs in this module perform global communications and must be +* called simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define FORCE5_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "mdflds.h" +#include "sw_term.h" +#include "sflds.h" +#include "dirac.h" +#include "linalg.h" +#include "sap.h" +#include "dfl.h" +#include "forces.h" +#include "global.h" + + +double setpf5(double mu0,double mu1,int ipf,int isp,int icom,int *status) +{ + double act; + complex_dble z; + spinor_dble **wsd,**rsd; + spinor_dble *phi,*psi,*chi; + mdflds_t *mdfs; + solver_parms_t sp; + sap_parms_t sap; + tm_parms_t tm; + + tm=tm_parms(); + if (tm.eoflg!=1) + set_tm_parms(1); + + mdfs=mdflds(); + phi=(*mdfs).pf[ipf]; + wsd=reserve_wsd(1); + psi=wsd[0]; + + random_sd(VOLUME/2,phi,1.0); + set_sd2zero(VOLUME/2,phi+(VOLUME/2)); + bnd_sd2zero(EVEN_PTS,phi); + sp=solver_parms(isp); + + if (sp.solver==CGNE) + { + tmcgeo(sp.nmx,sp.res,mu1,phi,psi,status); + + error_root(status[0]<0,1,"setpf5 [force5.c]","CGNE solver failed " + "(mu = %.4e, parameter set no %d, status = %d)", + mu1,isp,status[0]); + + rsd=reserve_wsd(1); + chi=rsd[0]; + assign_sd2sd(VOLUME/2,psi,chi); + Dwhat_dble(-mu1,chi,psi); + mulg5_dble(VOLUME/2,psi); + release_wsd(); + } + else if (sp.solver==SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + mulg5_dble(VOLUME/2,phi); + sap_gcr(sp.nkv,sp.nmx,sp.res,mu1,phi,psi,status); + mulg5_dble(VOLUME/2,phi); + + error_root(status[0]<0,1,"setpf5 [force5.c]","SAP_GCR solver failed " + "(mu = %.4e, parameter set no %d, status = %d)", + mu1,isp,status[0]); + } + else if (sp.solver==DFL_SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + mulg5_dble(VOLUME/2,phi); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu1,phi,psi,status); + mulg5_dble(VOLUME/2,phi); + + error_root((status[0]<0)||(status[1]<0),1,"setpf5 [force5.c]", + "DFL_SAP_GCR solver failed (mu = %.4e, parameter set " + "no %d, status = %d,%d,%d)",mu1,isp, + status[0],status[1],status[2]); + } + else + error_root(1,1,"setpf5 [force5.c]","Unknown solver"); + + z.re=0.0; + z.im=mu0-mu1; + mulc_spinor_add_dble(VOLUME/2,phi,psi,z); + act=(mu1*mu1-mu0*mu0)*norm_square_dble(VOLUME/2,icom,psi); + release_wsd(); + + return act; +} + + +void force5(double mu0,double mu1,int ipf,int isp,int icr, + double c,int *status) +{ + double dmu2; + + dmu2=mu1*mu1-mu0*mu0; + force4(mu0,ipf,0,isp,icr,dmu2*c,status); +} + + +double action5(double mu0,double mu1,int ipf,int isp,int icom,int *status) +{ + double dmu2,act; + + dmu2=mu1*mu1-mu0*mu0; + act=dmu2*action4(mu0,ipf,0,isp,icom,status); + + return act; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/frcfcts.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/frcfcts.c new file mode 100644 index 0000000000000000000000000000000000000000..3539839592927473fda98d190ea9caa0a7b188bb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/frcfcts.c @@ -0,0 +1,688 @@ + +/******************************************************************************* +* +* File frcfcts.c +* +* Copyright (C) 2005, 2011, 2012 Martin Luescher, Stefan Schaefer +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Generic functions used for the force calculation. +* +* The externally accessible functions are +* +* void det2xt(pauli_dble *m,u3_alg_dble *X) +* Computes the matrices X[0],..,X[5] associated to the SW term on a +* given lattice point (see the notes). The program expects that m[0] +* and m[1] contain the hermitian part of the inverse of the SW term +* at the chosen point. +* +* void prod2xt(spinor_dble *r,spinor_dble *s,u3_alg_dble *X) +* Computes the matrices X[0],..,X[5] associated to a pair of spinors +* r and s at a given lattice point (see the notes). +* +* The following is an array of functions indexed by the direction mu=0,..,3: +* +* void (*prod2xv[])(spinor_dble *rx,spinor_dble *ry, +* spinor_dble *sx,spinor_dble *sy,su3_dble *u) +* Computes the complex 3x3 matrix +* +* u=tr{gamma_5*(1-gamma_mu)*[(sy x rx^dag)+(ry x sx^dag)]} +* +* where ..x.. denotes the tensor product in spinor space and the trace +* is taken over the Dirac indices. +* +* Notes: +* +* As discussed in the notes +* +* M. Luescher: "Molecular-dynamics quark forces" (January 2012) +* +* the programs in this module serve to compute the spin part of the quark +* forces. The data type u3_alg_dble is described at the top of the module +* su3fcts/su3prod.c. +* +* The matrices computed by the program det2xt() are +* +* X[n]=i*tr{sigma_{mu,nu}*diag(m[0],m[1])} +* +* where (mu,nu)=(0,1),(0,2),(0,3),(2,3),(3,1),(1,2) for n=0,..,5. Similarly, +* the program prod2xt() computes +* +* X[n]=i*tr{(gamma_5*sigma_{mu,nu}*s) x (r^dag)+(s<->r)} +* +* where ..x.. denotes the tensor product in spinor space. In both cases, +* the trace is taken over the Dirac indices only. +* +*******************************************************************************/ + +#define FRCFCTS_C + +#include +#include +#include +#include "su3.h" +#include "forces.h" + +static su3_vector_dble psi1,psi2,chi1,chi2 ALIGNED16; +static pauli_dble ms[2] ALIGNED16; + +typedef union +{ + spinor_dble s; + weyl_dble w[2]; +} spin_t; + +#define _re(z,w) ((z).re*(w).re+(z).im*(w).im) +#define _im(z,w) ((z).im*(w).re-(z).re*(w).im) + + +void det2xt(pauli_dble *m,u3_alg_dble *X) +{ + double x,*up,*um; + u3_alg_dble *X0,*X1; + + up=m[0].u; + um=m[1].u; + + X0=X; + X1=X+3; + + x=up[10]+up[10]; + (*X0).c1=x; + (*X1).c1=-x; + x=um[10]+um[10]; + (*X0).c1-=x; + (*X1).c1-=x; + + x=up[20]+up[20]; + (*X0).c2=x; + (*X1).c2=-x; + x=um[20]+um[20]; + (*X0).c2-=x; + (*X1).c2-=x; + + x=up[28]+up[28]; + (*X0).c3=x; + (*X1).c3=-x; + x=um[28]+um[28]; + (*X0).c3-=x; + (*X1).c3-=x; + + x=up[19]-up[13]; + (*X0).c4=x; + (*X1).c4=-x; + x=um[19]-um[13]; + (*X0).c4-=x; + (*X1).c4-=x; + + x=up[12]+up[18]; + (*X0).c5=x; + (*X1).c5=-x; + x=um[12]+um[18]; + (*X0).c5-=x; + (*X1).c5-=x; + + x=up[25]-up[15]; + (*X0).c6=x; + (*X1).c6=-x; + x=um[25]-um[15]; + (*X0).c6-=x; + (*X1).c6-=x; + + x=up[14]+up[24]; + (*X0).c7=x; + (*X1).c7=-x; + x=um[14]+um[24]; + (*X0).c7-=x; + (*X1).c7-=x; + + x=up[27]-up[23]; + (*X0).c8=x; + (*X1).c8=-x; + x=um[27]-um[23]; + (*X0).c8-=x; + (*X1).c8-=x; + + x=up[22]+up[26]; + (*X0).c9=x; + (*X1).c9=-x; + x=um[22]+um[26]; + (*X0).c9-=x; + (*X1).c9-=x; + + X0=X+1; + X1=X+4; + + x=up[11]+up[11]; + (*X0).c1=-x; + (*X1).c1=x; + x=um[11]+um[11]; + (*X0).c1+=x; + (*X1).c1+=x; + + x=up[21]+up[21]; + (*X0).c2=-x; + (*X1).c2=x; + x=um[21]+um[21]; + (*X0).c2+=x; + (*X1).c2+=x; + + x=up[29]+up[29]; + (*X0).c3=-x; + (*X1).c3=x; + x=um[29]+um[29]; + (*X0).c3+=x; + (*X1).c3+=x; + + x=up[18]-up[12]; + (*X0).c4=x; + (*X1).c4=-x; + x=um[18]-um[12]; + (*X0).c4-=x; + (*X1).c4-=x; + + x=up[13]+up[19]; + (*X0).c5=-x; + (*X1).c5=x; + x=um[13]+um[19]; + (*X0).c5+=x; + (*X1).c5+=x; + + x=up[24]-up[14]; + (*X0).c6=x; + (*X1).c6=-x; + x=um[24]-um[14]; + (*X0).c6-=x; + (*X1).c6-=x; + + x=up[25]+up[15]; + (*X0).c7=-x; + (*X1).c7=x; + x=um[25]+um[15]; + (*X0).c7+=x; + (*X1).c7+=x; + + x=up[26]-up[22]; + (*X0).c8=x; + (*X1).c8=-x; + x=um[26]-um[22]; + (*X0).c8-=x; + (*X1).c8-=x; + + x=up[27]+up[23]; + (*X0).c9=-x; + (*X1).c9=x; + x=um[27]+um[23]; + (*X0).c9+=x; + (*X1).c9+=x; + + X0=X+2; + X1=X+5; + + x=up[0]-up[3]; + (*X0).c1=x; + (*X1).c1=-x; + x=um[0]-um[3]; + (*X0).c1-=x; + (*X1).c1-=x; + + x=up[1]-up[4]; + (*X0).c2=x; + (*X1).c2=-x; + x=um[1]-um[4]; + (*X0).c2-=x; + (*X1).c2-=x; + + x=up[2]-up[5]; + (*X0).c3=x; + (*X1).c3=-x; + x=um[2]-um[5]; + (*X0).c3-=x; + (*X1).c3-=x; + + x=up[31]-up[7]; + (*X0).c4=x; + (*X1).c4=-x; + x=um[31]-um[7]; + (*X0).c4-=x; + (*X1).c4-=x; + + x=up[6]-up[30]; + (*X0).c5=x; + (*X1).c5=-x; + x=um[6]-um[30]; + (*X0).c5-=x; + (*X1).c5-=x; + + x=up[33]-up[9]; + (*X0).c6=x; + (*X1).c6=-x; + x=um[33]-um[9]; + (*X0).c6-=x; + (*X1).c6-=x; + + x=up[8]-up[32]; + (*X0).c7=x; + (*X1).c7=-x; + x=um[8]-um[32]; + (*X0).c7-=x; + (*X1).c7-=x; + + x=up[35]-up[17]; + (*X0).c8=x; + (*X1).c8=-x; + x=um[35]-um[17]; + (*X0).c8-=x; + (*X1).c8-=x; + + x=up[16]-up[34]; + (*X0).c9=x; + (*X1).c9=-x; + x=um[16]-um[34]; + (*X0).c9-=x; + (*X1).c9-=x; +} + + +static void det2xt5(pauli_dble *m,u3_alg_dble *X) +{ + double x,*up,*um; + u3_alg_dble *X0,*X1; + + up=m[0].u; + um=m[1].u; + + X0=X; + X1=X+3; + + x=up[10]+up[10]; + (*X0).c1=x; + (*X1).c1=-x; + x=um[10]+um[10]; + (*X0).c1+=x; + (*X1).c1+=x; + + x=up[20]+up[20]; + (*X0).c2=x; + (*X1).c2=-x; + x=um[20]+um[20]; + (*X0).c2+=x; + (*X1).c2+=x; + + x=up[28]+up[28]; + (*X0).c3=x; + (*X1).c3=-x; + x=um[28]+um[28]; + (*X0).c3+=x; + (*X1).c3+=x; + + x=up[19]-up[13]; + (*X0).c4=x; + (*X1).c4=-x; + x=um[19]-um[13]; + (*X0).c4+=x; + (*X1).c4+=x; + + x=up[12]+up[18]; + (*X0).c5=x; + (*X1).c5=-x; + x=um[12]+um[18]; + (*X0).c5+=x; + (*X1).c5+=x; + + x=up[25]-up[15]; + (*X0).c6=x; + (*X1).c6=-x; + x=um[25]-um[15]; + (*X0).c6+=x; + (*X1).c6+=x; + + x=up[14]+up[24]; + (*X0).c7=x; + (*X1).c7=-x; + x=um[14]+um[24]; + (*X0).c7+=x; + (*X1).c7+=x; + + x=up[27]-up[23]; + (*X0).c8=x; + (*X1).c8=-x; + x=um[27]-um[23]; + (*X0).c8+=x; + (*X1).c8+=x; + + x=up[22]+up[26]; + (*X0).c9=x; + (*X1).c9=-x; + x=um[22]+um[26]; + (*X0).c9+=x; + (*X1).c9+=x; + + X0=X+1; + X1=X+4; + + x=up[11]+up[11]; + (*X0).c1=-x; + (*X1).c1=x; + x=um[11]+um[11]; + (*X0).c1-=x; + (*X1).c1-=x; + + x=up[21]+up[21]; + (*X0).c2=-x; + (*X1).c2=x; + x=um[21]+um[21]; + (*X0).c2-=x; + (*X1).c2-=x; + + x=up[29]+up[29]; + (*X0).c3=-x; + (*X1).c3=x; + x=um[29]+um[29]; + (*X0).c3-=x; + (*X1).c3-=x; + + x=up[18]-up[12]; + (*X0).c4=x; + (*X1).c4=-x; + x=um[18]-um[12]; + (*X0).c4+=x; + (*X1).c4+=x; + + x=up[13]+up[19]; + (*X0).c5=-x; + (*X1).c5=x; + x=um[13]+um[19]; + (*X0).c5-=x; + (*X1).c5-=x; + + x=up[24]-up[14]; + (*X0).c6=x; + (*X1).c6=-x; + x=um[24]-um[14]; + (*X0).c6+=x; + (*X1).c6+=x; + + x=up[25]+up[15]; + (*X0).c7=-x; + (*X1).c7=x; + x=um[25]+um[15]; + (*X0).c7-=x; + (*X1).c7-=x; + + x=up[26]-up[22]; + (*X0).c8=x; + (*X1).c8=-x; + x=um[26]-um[22]; + (*X0).c8+=x; + (*X1).c8+=x; + + x=up[27]+up[23]; + (*X0).c9=-x; + (*X1).c9=x; + x=um[27]+um[23]; + (*X0).c9-=x; + (*X1).c9-=x; + + X0=X+2; + X1=X+5; + + x=up[0]-up[3]; + (*X0).c1=x; + (*X1).c1=-x; + x=um[0]-um[3]; + (*X0).c1+=x; + (*X1).c1+=x; + + x=up[1]-up[4]; + (*X0).c2=x; + (*X1).c2=-x; + x=um[1]-um[4]; + (*X0).c2+=x; + (*X1).c2+=x; + + x=up[2]-up[5]; + (*X0).c3=x; + (*X1).c3=-x; + x=um[2]-um[5]; + (*X0).c3+=x; + (*X1).c3+=x; + + x=up[31]-up[7]; + (*X0).c4=x; + (*X1).c4=-x; + x=um[31]-um[7]; + (*X0).c4+=x; + (*X1).c4+=x; + + x=up[6]-up[30]; + (*X0).c5=x; + (*X1).c5=-x; + x=um[6]-um[30]; + (*X0).c5+=x; + (*X1).c5+=x; + + x=up[33]-up[9]; + (*X0).c6=x; + (*X1).c6=-x; + x=um[33]-um[9]; + (*X0).c6+=x; + (*X1).c6+=x; + + x=up[8]-up[32]; + (*X0).c7=x; + (*X1).c7=-x; + x=um[8]-um[32]; + (*X0).c7+=x; + (*X1).c7+=x; + + x=up[35]-up[17]; + (*X0).c8=x; + (*X1).c8=-x; + x=um[35]-um[17]; + (*X0).c8+=x; + (*X1).c8+=x; + + x=up[16]-up[34]; + (*X0).c9=x; + (*X1).c9=-x; + x=um[16]-um[34]; + (*X0).c9+=x; + (*X1).c9+=x; +} + + +static void vec2pauli(weyl_dble *r,weyl_dble *s,pauli_dble *m) +{ + double *u; + su3_vector_dble *r1,*r2,*s1,*s2; + + u=(*m).u; + r1=&((*r).c1); + r2=&((*r).c2); + s1=&((*s).c1); + s2=&((*s).c2); + + u[ 0]=_re((*s1).c1,(*r1).c1)+_re((*s1).c1,(*r1).c1); + u[ 1]=_re((*s1).c2,(*r1).c2)+_re((*s1).c2,(*r1).c2); + u[ 2]=_re((*s1).c3,(*r1).c3)+_re((*s1).c3,(*r1).c3); + + u[ 3]=_re((*s2).c1,(*r2).c1)+_re((*s2).c1,(*r2).c1); + u[ 4]=_re((*s2).c2,(*r2).c2)+_re((*s2).c2,(*r2).c2); + u[ 5]=_re((*s2).c3,(*r2).c3)+_re((*s2).c3,(*r2).c3); + + u[ 6]=_re((*s1).c1,(*r1).c2)+_re((*r1).c1,(*s1).c2); + u[ 7]=_im((*s1).c1,(*r1).c2)+_im((*r1).c1,(*s1).c2); + u[ 8]=_re((*s1).c1,(*r1).c3)+_re((*r1).c1,(*s1).c3); + u[ 9]=_im((*s1).c1,(*r1).c3)+_im((*r1).c1,(*s1).c3); + + u[10]=_re((*s1).c1,(*r2).c1)+_re((*r1).c1,(*s2).c1); + u[11]=_im((*s1).c1,(*r2).c1)+_im((*r1).c1,(*s2).c1); + u[12]=_re((*s1).c1,(*r2).c2)+_re((*r1).c1,(*s2).c2); + u[13]=_im((*s1).c1,(*r2).c2)+_im((*r1).c1,(*s2).c2); + u[14]=_re((*s1).c1,(*r2).c3)+_re((*r1).c1,(*s2).c3); + u[15]=_im((*s1).c1,(*r2).c3)+_im((*r1).c1,(*s2).c3); + + u[16]=_re((*s1).c2,(*r1).c3)+_re((*r1).c2,(*s1).c3); + u[17]=_im((*s1).c2,(*r1).c3)+_im((*r1).c2,(*s1).c3); + + u[18]=_re((*s1).c2,(*r2).c1)+_re((*r1).c2,(*s2).c1); + u[19]=_im((*s1).c2,(*r2).c1)+_im((*r1).c2,(*s2).c1); + u[20]=_re((*s1).c2,(*r2).c2)+_re((*r1).c2,(*s2).c2); + u[21]=_im((*s1).c2,(*r2).c2)+_im((*r1).c2,(*s2).c2); + u[22]=_re((*s1).c2,(*r2).c3)+_re((*r1).c2,(*s2).c3); + u[23]=_im((*s1).c2,(*r2).c3)+_im((*r1).c2,(*s2).c3); + + u[24]=_re((*s1).c3,(*r2).c1)+_re((*r1).c3,(*s2).c1); + u[25]=_im((*s1).c3,(*r2).c1)+_im((*r1).c3,(*s2).c1); + u[26]=_re((*s1).c3,(*r2).c2)+_re((*r1).c3,(*s2).c2); + u[27]=_im((*s1).c3,(*r2).c2)+_im((*r1).c3,(*s2).c2); + u[28]=_re((*s1).c3,(*r2).c3)+_re((*r1).c3,(*s2).c3); + u[29]=_im((*s1).c3,(*r2).c3)+_im((*r1).c3,(*s2).c3); + + u[30]=_re((*s2).c1,(*r2).c2)+_re((*r2).c1,(*s2).c2); + u[31]=_im((*s2).c1,(*r2).c2)+_im((*r2).c1,(*s2).c2); + u[32]=_re((*s2).c1,(*r2).c3)+_re((*r2).c1,(*s2).c3); + u[33]=_im((*s2).c1,(*r2).c3)+_im((*r2).c1,(*s2).c3); + u[34]=_re((*s2).c2,(*r2).c3)+_re((*r2).c2,(*s2).c3); + u[35]=_im((*s2).c2,(*r2).c3)+_im((*r2).c2,(*s2).c3); +} + + +void prod2xt(spinor_dble *r,spinor_dble *s,u3_alg_dble *X) +{ + spin_t *spr,*sps; + + spr=(spin_t*)(r); + sps=(spin_t*)(s); + + vec2pauli((*spr).w,(*sps).w,ms); + vec2pauli((*spr).w+1,(*sps).w+1,ms+1); + + det2xt5(ms,X); +} + + +static void set2mat(su3_dble *u) +{ + (*u).c11.re=_re(psi1.c1,chi1.c1)+_re(psi2.c1,chi2.c1); + (*u).c11.im=_im(psi1.c1,chi1.c1)+_im(psi2.c1,chi2.c1); + (*u).c12.re=_re(psi1.c1,chi1.c2)+_re(psi2.c1,chi2.c2); + (*u).c12.im=_im(psi1.c1,chi1.c2)+_im(psi2.c1,chi2.c2); + (*u).c13.re=_re(psi1.c1,chi1.c3)+_re(psi2.c1,chi2.c3); + (*u).c13.im=_im(psi1.c1,chi1.c3)+_im(psi2.c1,chi2.c3); + + (*u).c21.re=_re(psi1.c2,chi1.c1)+_re(psi2.c2,chi2.c1); + (*u).c21.im=_im(psi1.c2,chi1.c1)+_im(psi2.c2,chi2.c1); + (*u).c22.re=_re(psi1.c2,chi1.c2)+_re(psi2.c2,chi2.c2); + (*u).c22.im=_im(psi1.c2,chi1.c2)+_im(psi2.c2,chi2.c2); + (*u).c23.re=_re(psi1.c2,chi1.c3)+_re(psi2.c2,chi2.c3); + (*u).c23.im=_im(psi1.c2,chi1.c3)+_im(psi2.c2,chi2.c3); + + (*u).c31.re=_re(psi1.c3,chi1.c1)+_re(psi2.c3,chi2.c1); + (*u).c31.im=_im(psi1.c3,chi1.c1)+_im(psi2.c3,chi2.c1); + (*u).c32.re=_re(psi1.c3,chi1.c2)+_re(psi2.c3,chi2.c2); + (*u).c32.im=_im(psi1.c3,chi1.c2)+_im(psi2.c3,chi2.c2); + (*u).c33.re=_re(psi1.c3,chi1.c3)+_re(psi2.c3,chi2.c3); + (*u).c33.im=_im(psi1.c3,chi1.c3)+_im(psi2.c3,chi2.c3); +} + + +static void add2mat(su3_dble *u) +{ + (*u).c11.re+=_re(psi1.c1,chi1.c1)+_re(psi2.c1,chi2.c1); + (*u).c11.im+=_im(psi1.c1,chi1.c1)+_im(psi2.c1,chi2.c1); + (*u).c12.re+=_re(psi1.c1,chi1.c2)+_re(psi2.c1,chi2.c2); + (*u).c12.im+=_im(psi1.c1,chi1.c2)+_im(psi2.c1,chi2.c2); + (*u).c13.re+=_re(psi1.c1,chi1.c3)+_re(psi2.c1,chi2.c3); + (*u).c13.im+=_im(psi1.c1,chi1.c3)+_im(psi2.c1,chi2.c3); + + (*u).c21.re+=_re(psi1.c2,chi1.c1)+_re(psi2.c2,chi2.c1); + (*u).c21.im+=_im(psi1.c2,chi1.c1)+_im(psi2.c2,chi2.c1); + (*u).c22.re+=_re(psi1.c2,chi1.c2)+_re(psi2.c2,chi2.c2); + (*u).c22.im+=_im(psi1.c2,chi1.c2)+_im(psi2.c2,chi2.c2); + (*u).c23.re+=_re(psi1.c2,chi1.c3)+_re(psi2.c2,chi2.c3); + (*u).c23.im+=_im(psi1.c2,chi1.c3)+_im(psi2.c2,chi2.c3); + + (*u).c31.re+=_re(psi1.c3,chi1.c1)+_re(psi2.c3,chi2.c1); + (*u).c31.im+=_im(psi1.c3,chi1.c1)+_im(psi2.c3,chi2.c1); + (*u).c32.re+=_re(psi1.c3,chi1.c2)+_re(psi2.c3,chi2.c2); + (*u).c32.im+=_im(psi1.c3,chi1.c2)+_im(psi2.c3,chi2.c2); + (*u).c33.re+=_re(psi1.c3,chi1.c3)+_re(psi2.c3,chi2.c3); + (*u).c33.im+=_im(psi1.c3,chi1.c3)+_im(psi2.c3,chi2.c3); +} + + +static void prod2xv0(spinor_dble *rx,spinor_dble *ry, + spinor_dble *sx,spinor_dble *sy,su3_dble *u) +{ + _vector_add(psi1,(*ry).c1,(*ry).c3); + _vector_add(psi2,(*ry).c2,(*ry).c4); + _vector_sub(chi1,(*sx).c1,(*sx).c3); + _vector_sub(chi2,(*sx).c2,(*sx).c4); + set2mat(u); + + _vector_add(psi1,(*sy).c1,(*sy).c3); + _vector_add(psi2,(*sy).c2,(*sy).c4); + _vector_sub(chi1,(*rx).c1,(*rx).c3); + _vector_sub(chi2,(*rx).c2,(*rx).c4); + add2mat(u); +} + + +static void prod2xv1(spinor_dble *rx,spinor_dble *ry, + spinor_dble *sx,spinor_dble *sy,su3_dble *u) +{ + _vector_i_add(psi1,(*ry).c1,(*ry).c4); + _vector_i_add(psi2,(*ry).c2,(*ry).c3); + _vector_i_sub(chi1,(*sx).c1,(*sx).c4); + _vector_i_sub(chi2,(*sx).c2,(*sx).c3); + set2mat(u); + + _vector_i_add(psi1,(*sy).c1,(*sy).c4); + _vector_i_add(psi2,(*sy).c2,(*sy).c3); + _vector_i_sub(chi1,(*rx).c1,(*rx).c4); + _vector_i_sub(chi2,(*rx).c2,(*rx).c3); + add2mat(u); +} + + +static void prod2xv2(spinor_dble *rx,spinor_dble *ry, + spinor_dble *sx,spinor_dble *sy,su3_dble *u) +{ + _vector_add(psi1,(*ry).c1,(*ry).c4); + _vector_sub(psi2,(*ry).c2,(*ry).c3); + _vector_sub(chi1,(*sx).c1,(*sx).c4); + _vector_add(chi2,(*sx).c2,(*sx).c3); + set2mat(u); + + _vector_add(psi1,(*sy).c1,(*sy).c4); + _vector_sub(psi2,(*sy).c2,(*sy).c3); + _vector_sub(chi1,(*rx).c1,(*rx).c4); + _vector_add(chi2,(*rx).c2,(*rx).c3); + add2mat(u); +} + + +static void prod2xv3(spinor_dble *rx,spinor_dble *ry, + spinor_dble *sx,spinor_dble *sy,su3_dble *u) +{ + _vector_i_add(psi1,(*ry).c1,(*ry).c3); + _vector_i_sub(psi2,(*ry).c2,(*ry).c4); + _vector_i_sub(chi1,(*sx).c1,(*sx).c3); + _vector_i_add(chi2,(*sx).c2,(*sx).c4); + set2mat(u); + + _vector_i_add(psi1,(*sy).c1,(*sy).c3); + _vector_i_sub(psi2,(*sy).c2,(*sy).c4); + _vector_i_sub(chi1,(*rx).c1,(*rx).c3); + _vector_i_add(chi2,(*rx).c2,(*rx).c4); + add2mat(u); +} + + +void (*prod2xv[4])(spinor_dble *rx,spinor_dble *ry, + spinor_dble *sx,spinor_dble *sy,su3_dble *u)= +{prod2xv0,prod2xv1,prod2xv2,prod2xv3}; diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/genfrc.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/genfrc.c new file mode 100644 index 0000000000000000000000000000000000000000..3fc357614864a327328158591e6e2169e457d36e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/genfrc.c @@ -0,0 +1,195 @@ +/******************************************************************************* +* +* File genfrc.c +* +* Copyright (C) 2006, 2011, 2013 Martin Luescher, Stefan Schaefer +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Calculation of quark forces. +* +* The externally accessible functions are +* +* void sw_frc(double c) +* Computes the SW part of the quark force, using the current value +* of the X tensor field (see the notes). The calculated force is then +* multiplied by c and added to the MD force field. +* +* void hop_frc(double c) +* Computes the hopping part of the quark force, using the current +* value of the X vector field (see the notes). The calculated force +* is then multiplied by c and added to the MD force field. +* +* Notes: +* +* The computation of the quark forces is described in the notes +* +* M. Luescher: "Molecular-dynamics quark forces" (January 2012) +* +* For explanations of the X tensor and vector fields, see xtensor.c and +* frcfcts.c. The MD force field is the one returned by the program mdflds() +* (see mdflds/mdflds.c). +* +* If the X tensor field is obtained from the SW term calculated by sw_term(), +* and if the X vector field is obtained from quark fields vanishing at the +* boundaries of the lattice, as required by the chosen boundary conditions, +* the programs sw_frc() and hop_frc() leave the force field on the static +* links unchanged. +* +* The coefficient csw of the SW term is retrieved from the parameter data +* base (flags/lat_parms.c). The programs in this module perform global +* operations and must be called simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define GENFRC_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "su3fcts.h" +#include "flags.h" +#include "lattice.h" +#include "mdflds.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "tcharge.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) + +static su3_dble w[6] ALIGNED16; +static su3_alg_dble Y[8] ALIGNED16; + + +void sw_frc(double c) +{ + int bc,n,ix,t; + int ipu[4],ipx[4]; + u3_alg_dble **xt,*Xb; + su3_alg_dble *fb,*fr; + su3_dble *ub; + mdflds_t *mdfs; + sw_parms_t swp; + + bc=bc_type(); + swp=sw_parms(); + c*=0.0625*swp.csw; + + mdfs=mdflds(); + fb=(*mdfs).frc; + set_alg2zero(7*(BNDRY/4),fb+4*VOLUME); + + if (query_flags(UDBUF_UP2DATE)!=1) + copy_bnd_ud(); + ub=udfld(); + xt=xtensor(); + + for (n=0;n<6;n++) + { + Xb=xt[n]; + copy_bnd_ft(n,Xb); + + for (ix=0;ix=3)||(bc==0)||(bc==3)) + { + fr=fb+ipu[1]; + _su3_alg_mul_add_assign(*fr,c,Y[6]); + fr=fb+ipu[2]; + _su3_alg_mul_sub_assign(*fr,c,Y[1]); + } + else + { + if (t<(N0-1)) + { + fr=fb+ipu[1]; + _su3_alg_mul_add_assign(*fr,c,Y[6]); + } + + if ((t>0)||(bc==2)) + { + fr=fb+ipu[2]; + _su3_alg_mul_sub_assign(*fr,c,Y[1]); + } + } + + fr=fb+ipu[3]; + _su3_alg_mul_sub_assign(*fr,c,Y[3]); + } + } + + add_bnd_frc(); +} + + +void hop_frc(double c) +{ + su3_alg_dble *fr; + su3_dble *xv,*u,*um; + mdflds_t *mdfs; + + xv=xvector(); + mdfs=mdflds(); + fr=(*mdfs).frc; + + u=udfld(); + um=u+4*VOLUME; + c*=-0.5; + + for (;u=-1. Otherwise the field psi is set to zero and the +* programs return the norm of the source eta. +* +* The SW term is recalculated when needed. Evidently the solver is a global +* program that must be called on all processes simultaneously. The required +* workspaces are +* +* spinor 5 +* spinor_dble 3 +* +* (see utils/wspace.c). +* +*******************************************************************************/ + +#define TMCG_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "linsolv.h" +#include "forces.h" +#include "global.h" + +static float mus; +static double mud; + + +static void Dop(spinor *s,spinor *r) +{ + Dw(mus,s,r); + mulg5(VOLUME,r); + mus=-mus; +} + + +static void Dop_dble(spinor_dble *s,spinor_dble *r) +{ + Dw_dble(mud,s,r); + mulg5_dble(VOLUME,r); + mud=-mud; +} + + +double tmcg(int nmx,double res,double mu, + spinor_dble *eta,spinor_dble *psi,int *status) +{ + double rho,rho0,fact; + spinor **ws; + spinor_dble **rsd,**wsd; + tm_parms_t tm; + + tm=tm_parms(); + if (tm.eoflg==1) + set_tm_parms(0); + + if (query_flags(U_MATCH_UD)!=1) + assign_ud2u(); + + sw_term(NO_PTS); + + if ((query_flags(SW_UP2DATE)!=1)|| + (query_flags(SW_E_INVERTED)!=0)||(query_flags(SW_O_INVERTED)!=0)) + assign_swd2sw(); + + ws=reserve_ws(5); + wsd=reserve_wsd(2); + rsd=reserve_wsd(1); + + mus=(float)(mu); + mud=mu; + rho0=sqrt(norm_square_dble(VOLUME,1,eta)); + fact=rho0/sqrt((double)(VOLUME)*(double)(24*NPROC)); + + if (fact!=0.0) + { + assign_sd2sd(VOLUME,eta,rsd[0]); + scale_dble(VOLUME,1.0/fact,rsd[0]); + + rho=cgne(VOLUME,1,Dop,Dop_dble,ws,wsd,nmx,res,rsd[0],psi,status); + + scale_dble(VOLUME,fact,psi); + rho*=fact; + } + else + { + status[0]=0; + rho=0.0; + set_sd2zero(VOLUME,psi); + } + + release_wsd(); + release_wsd(); + release_ws(); + + if (status[0]<-1) + { + rho=rho0; + set_sd2zero(VOLUME,psi); + } + + return rho; +} + + +static void Doph(spinor *s,spinor *r) +{ + Dwhat(mus,s,r); + mulg5(VOLUME/2,r); + mus=-mus; +} + + +static void Doph_dble(spinor_dble *s,spinor_dble *r) +{ + Dwhat_dble(mud,s,r); + mulg5_dble(VOLUME/2,r); + mud=-mud; +} + + +double tmcgeo(int nmx,double res,double mu, + spinor_dble *eta,spinor_dble *psi,int *status) +{ + int ifail; + double rho,rho0,fact; + spinor **ws; + spinor_dble **rsd,**wsd; + + rho0=sqrt(norm_square_dble(VOLUME/2,1,eta)); + ifail=sw_term(ODD_PTS); + + if (ifail) + { + status[0]=-2; + rho=rho0; + } + else + { + if (query_flags(U_MATCH_UD)!=1) + assign_ud2u(); + + if ((query_flags(SW_UP2DATE)!=1)|| + (query_flags(SW_E_INVERTED)!=0)||(query_flags(SW_O_INVERTED)!=1)) + assign_swd2sw(); + + ws=reserve_ws(5); + wsd=reserve_wsd(2); + rsd=reserve_wsd(1); + + mus=(float)(mu); + mud=mu; + fact=rho0/sqrt((double)(VOLUME/2)*(double)(24*NPROC)); + + if (fact!=0.0) + { + assign_sd2sd(VOLUME/2,eta,rsd[0]); + scale_dble(VOLUME/2,1.0/fact,rsd[0]); + + rho=cgne(VOLUME/2,1,Doph,Doph_dble,ws,wsd,nmx,res,rsd[0],psi,status); + + scale_dble(VOLUME/2,fact,psi); + rho*=fact; + } + else + { + status[0]=0; + rho=0.0; + set_sd2zero(VOLUME/2,psi); + } + + release_wsd(); + release_wsd(); + release_ws(); + } + + if (status[0]<-1) + { + rho=rho0; + set_sd2zero(VOLUME/2,psi); + } + + return rho; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/tmcgm.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/tmcgm.c new file mode 100644 index 0000000000000000000000000000000000000000..eecc97ce2f4d8f874db511bc1edbe73bad989b77 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/tmcgm.c @@ -0,0 +1,126 @@ + +/******************************************************************************* +* +* File tmcgm.c +* +* Copyright (C) 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Multi-shift CG solver for the normal even-odd preconditioned Wilson-Dirac +* equation (Dwhat^dag*Dwhat+mu^2)*psi=eta with a twisted-mass term. +* +* The externally accessible function is +* +* void tmcgm(int nmx,double *res,int nmu,double *mu, +* spinor_dble *eta,spinor_dble **psi,int *status) +* Obtains approximate solutions psi[0],..,psi[nmu-1] of the normal +* even-odd preconditioned Wilson-Dirac equation for given source eta +* and nmu values of the twisted-mass parameter mu. See the notes for +* the explanation of the parameters of the program. +* +* Notes: +* +* The program is based on the multi-shift CG algorithm (see linsolv/mscg.c). +* It assumes that the improvement coefficients and the quark mass in the +* SW term have been set through set_lat_parms() and set_sw_parms() (see +* flags/lat_parms.c). +* +* All other parameters are passed through the argument list: +* +* nmx Maximal total number of CG iterations that may be performed. +* +* res Array of the desired maximal relative residues of the +* calculated solutions (nmu elements) +* +* nmu Number of twisted masses mu. +* +* mu Array of the twisted masses (nmu elements) +* +* eta Source field. Note that source fields must respect the chosen +* boundary conditions at time 0 and NPR0C0*L0-1, as has to be the +* the case for physical quark fields (see doc/dirac.pdf). +* +* psi Array of the calculated approximate solutions of the Dirac +* equations (Dwhat^dag*Dwhat+mu^2)*psi=eta (nmu elements). +* +* status If the program was able to solve the Dirac equations to the +* desired accuracy, status[0] reports the total number of CG +* iterations that were required. Negative values indicate that +* the program failed (-1: the algorithm did not converge, -2: +* the inversion of the SW term on the odd points was not safe). +* +* The source field eta must be different from psi[0],..,psi[nmu-1]. If +* status[0]>=-1 the calculated approximate solutions are returned. In +* all other cases, the fields are set to zero. +* +* The SW term is recalculated when needed. Evidently the solver is a global +* program that must be called on all processes simultaneously. The required +* workspace is +* +* spinor_dble 3+nmu (5 if nmu=1) +* +* (see utils/wspace.c). +* +*******************************************************************************/ + +#define TMCGM_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "linsolv.h" +#include "forces.h" +#include "global.h" + +static int iop=0; + + +static void Dop_dble(double mu,spinor_dble *s,spinor_dble *r) +{ + if (iop==0) + Dwhat_dble(mu,s,r); + else + Dwhat_dble(-mu,s,r); + + mulg5_dble(VOLUME/2,r); + iop^=0x1; +} + + +void tmcgm(int nmx,double *res,int nmu,double *mu, + spinor_dble *eta,spinor_dble **psi,int *status) +{ + int ifail,k; + spinor_dble **wsd; + + ifail=sw_term(ODD_PTS); + + if (ifail) + { + status[0]=-2; + + for (k=0;kr)} +* +* The contribution of the fields r,s to the X vector component on the link +* (x,x+mu) is given by +* +* X=tr{[gamma_5*(1-gamma_mu)*s(x+mu) x r^dag(x)]+(s<->r)} +* +* In all cases, the trace is taken over the Dirac indices only. +* +* The components of the X tensor field are of type u3_alg_dble. As in the +* case of symmetric gauge-field tensor, the field array includes additional +* space for the field components on the boundaries of the local lattice +* (see tcharge/ftensor.c and lattice/README.ftidx). The type u3_alg_dble +* is explained in the module su3fcts/su3prod.c. +* +* The programs in this module may perform global operations and must be +* called simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define XTENSOR_C + +#include +#include +#include +#include "su3.h" +#include "utils.h" +#include "lattice.h" +#include "sw_term.h" +#include "sflds.h" +#include "linalg.h" +#include "forces.h" +#include "global.h" + +#define _u3_alg_mul_add_assign(r,c,s) \ + (r).c1+=(c)*(s).c1; \ + (r).c2+=(c)*(s).c2; \ + (r).c3+=(c)*(s).c3; \ + (r).c4+=(c)*(s).c4; \ + (r).c5+=(c)*(s).c5; \ + (r).c6+=(c)*(s).c6; \ + (r).c7+=(c)*(s).c7; \ + (r).c8+=(c)*(s).c8; \ + (r).c9+=(c)*(s).c9 + +#define _su3_mul_add_assign(r,c,s) \ + (r).c11.re+=(c)*(s).c11.re; \ + (r).c11.im+=(c)*(s).c11.im; \ + (r).c12.re+=(c)*(s).c12.re; \ + (r).c12.im+=(c)*(s).c12.im; \ + (r).c13.re+=(c)*(s).c13.re; \ + (r).c13.im+=(c)*(s).c13.im; \ + (r).c21.re+=(c)*(s).c21.re; \ + (r).c21.im+=(c)*(s).c21.im; \ + (r).c22.re+=(c)*(s).c22.re; \ + (r).c22.im+=(c)*(s).c22.im; \ + (r).c23.re+=(c)*(s).c23.re; \ + (r).c23.im+=(c)*(s).c23.im; \ + (r).c31.re+=(c)*(s).c31.re; \ + (r).c31.im+=(c)*(s).c31.im; \ + (r).c32.re+=(c)*(s).c32.re; \ + (r).c32.im+=(c)*(s).c32.im; \ + (r).c33.re+=(c)*(s).c33.re; \ + (r).c33.im+=(c)*(s).c33.im + +static u3_alg_dble X[6]; +static u3_alg_dble **xts=NULL,**xt; +static const su3_dble ud0={{0.0}}; +static su3_dble w ALIGNED16; +static su3_dble *xvs=NULL,*xv; + + +static void alloc_xts(void) +{ + int n,nt,nxt[6]; + u3_alg_dble **pp,*p; + ftidx_t *idx; + + idx=ftidx(); + nt=0; + + for (n=0;n<6;n++) + { + nxt[n]=VOLUME+idx[n].nft[0]+idx[n].nft[1]; + nt+=nxt[n]; + } + + pp=malloc(12*sizeof(*pp)); + p=amalloc(nt*sizeof(*p),ALIGN); + error((pp==NULL)||(p==NULL),1,"alloc_xts [xtensor.c]", + "Unable to allocate field arrays"); + + set_ualg2zero(nt,p); + xts=pp; + xt=pp+6; + + for (n=0;n<6;n++) + { + (*pp)=p; + pp+=1; + p+=nxt[n]; + } +} + + +u3_alg_dble **xtensor(void) +{ + int n; + + if (xts==NULL) + alloc_xts(); + + for (n=0;n<6;n++) + xt[n]=xts[n]; + + return xt; +} + + +void set_xt2zero(void) +{ + int n; + + if (xts==NULL) + alloc_xts(); + else + { + for (n=0;n<6;n++) + set_ualg2zero(VOLUME,xts[n]); + } +} + + +int add_det2xt(double c,ptset_t set) +{ + int n,ifail; + pauli_dble *m,*mm; + + if (set==NO_PTS) + return 0; + + ifail=sw_term(set); + + if (ifail!=0) + return ifail; + + if (xts==NULL) + alloc_xts(); + + if (set==ODD_PTS) + { + for (n=0;n<6;n++) + xt[n]=xts[n]+(VOLUME/2); + + m=swdfld()+VOLUME; + } + else + { + for (n=0;n<6;n++) + xt[n]=xts[n]; + + m=swdfld(); + } + + if (set==ALL_PTS) + mm=m+(2*VOLUME); + else + mm=m+VOLUME; + + for (;m=0.0 sets an upper bound on + the tolerated difference of the boundary values of the gauge field from + the expected ones in the case of SF and open-SF boundary conditions. + +int chs_ubnd(int ibc) + Multiplies the double-precision link variables on the time-like links + at time NPROC0*L0-1 by -1 if the following conditions are met: (1) ibc + and the determinants of the link variables have opposite sign, (2) the + boundary conditions are of type 3 (periodic for the gauge field). The + program returns 1 if the link variables are changed and 0 otherwise. + +void bnd_s2zero(ptset_t set,spinor *s) + Sets the components of the single-precision spinor field s on the + specified set of points at global time 0 and T (in the case of + open boundary conditions) to zero. + +void bnd_sd2zero(ptset_t set,spinor_dble *sd) + Sets the components of the double-precision spinor field sd on the + specified set of points at global time 0 and T (in the case of + open boundary conditions) to zero. + +ftidx_t *ftidx(void) + Returns an array idx[6] of ftidx_t structures containing the offsets + of the field tensor components on the boundaries of the local lattice + (see the file README.ftidx). + +void plaq_ftidx(int n,int ix,int *ip) + Calculates the offsets ip[4] of the field tensor components at the + corners of the (mu,nu)-plaquette at the point in the local lattice + with label ix. The indices (mu,nu) are determined by the parameter + n=0,..,5 (see the notes). + +int ipr_global(int n[]) + This program returns the number of the process with cartesian + coordinates n[0],..,n[3] in the process grid + +void ipt_global(int x[],int *ip,int *ix) + Given the coordinates x[0],..,x[3] of a point on the full lattice, + this program determines the number ip of the process that operates + on the corresponding local lattice and the associated local point + index ix (0<=ix=VOLUME are reserved for copies of the field tensor on the boundaries +of the local lattice in directions +mu and +nu: + + + Cross section of the lattice in the (mu,nu)-plane + + - - - - - - - - - - - - + + * * * * * * * * * * * * + *: local lattice + * * * * * * * * * * * * + + * * * * * * * * * * * * + +: mu-face + * * * * * * * * * * * * + nu + * * * * * * * * * * * * + ^ -: nu-face + * * * * * * * * * * * * + | + * * * * * * * * * * * * + ---> mu + + +The first of them, the "mu-face", includes the points at the (+mu,+nu) +corner of the local lattice. The numbers of points in these faces are +denoted by nft[n][0] an nft[n][1]. + + +Contents of the structures of type ftidx_t +------------------------------------------ + +A structure of type ftidx_t contains index data that refer to some +(mu,nu)-plane. The elements of the structure are + + nft[0]: Number of points in the mu-face. + + nft[1]: Number of points in the nu-face. + + ift[0][n]: Offsets of the field components in the local lattice + and the nu-face that correspond to the field components + in the mu-face on the MPI process in direction -mu + (n=0,..,nft[0]-1). + + ift[1][n]: Offsets of the field components in the local lattice + that correspond to the field components in the nu-face + on the MPI process in direction -nu (n=0,..,nft[1]-1). + +Using these index arrays, the field components on the mu- and nu-faces are +easily extracted from the local gauge fields on the neighbouring MPI +processes. + +Note that copying of the field tensor must be performed in a particular order +to ensure that the components at the (+mu,+nu)-corner of the local lattice are +correctly copied. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/README.uidx b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/README.uidx new file mode 100644 index 0000000000000000000000000000000000000000..ac600f7b94260a2156779b7a563a12ce5cd285f4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/README.uidx @@ -0,0 +1,121 @@ + +******************************************************************************** + + Layout of the double-precision gauge field array + +******************************************************************************** + +As explained in main/README.global, the gauge field on the local lattice is +defined by its values on the 8 links attached to the odd lattice points. The +integer offset from the base address of the link variable U(x,mu) at the odd +point x is + + 8*(ix-(VOLUME/2))+2*mu + +while the one of U(x-mu,mu) is + + 8*(ix-(VOLUME/2))+2*mu+1 + +where ix denotes the index of x on the local lattice. + +When the double-precision gauge field is allocated, space is reserved for +further 7*BNDRY/4 link variables at the end of the field array. The additional +space is used for copies of the link variables at the external boundaries of +the local lattice in the directions +0,+1,+2 and +3. + +When SF or open-SF boundary conditions are chosen, the boundary values of the +field at time T are stored in 3 link variables appended to the field array on +the MPI processes with cpr[0]=NPROC0-1. The total size of the array in this +case is thus 4*VOLUME+7*(BNDRY/4)+3, while in all other cases it is +4*VOLUME+7*(BNDRY/4). + + +Labeling of the boundary points +------------------------------- + +The faces in direction -0,+0,..,-3+3 of the local lattice are labeled by an +index ifc=0,1,..,7 and so are its exterior boundaries. In the following, the +term "boundary segment" is used for the set of even (or odd) exterior boundary +points in a given direction ifc. There are thus 16 boundary segments. + +Each point y in a boundary segment has a unique "partner point" x on the local +lattice such that |x-y|=1. The points in the local lattice are totally ordered +by their index ix=0,..,VOLUME-1. It is then natural to label the points y in +the boundary segment by an index ib=0,1,2,.. that respects the order of the +partner points x. + +If x and y are as above, and if y is on the face with label ifc, the index ib +of y is explicitly given by + + ib=iy-ofs[ifc] if y is even, + + ib=iy-ofs[ifc]-BNDRY/2 if y is odd, + +where iy is the index of y and + + ofs[0]=VOLUME + ofs[1]=ofs[0]+FACE0/2 + ofs[2]=ofs[1]+FACE0/2 + ofs[3]=ofs[2]+FACE1/2 + ofs[4]=ofs[3]+FACE1/2 + ofs[5]=ofs[4]+FACE2/2 + ofs[6]=ofs[5]+FACE2/2 + ofs[7]=ofs[6]+FACE3/2 + +Note that + + iy=iup[ix][mu] on the face in direction +mu, + + iy=idn[ix][mu] on the face in direction -mu. + +While the labeling of the points in a boundary segment is always taken to be +the one described here, the 16 boundary segments may be ordered in various +ways depending on the context. + + +Boundary fields +--------------- + +Along the faces in direction +mu, two kinds of link variables must be +distinguished (assuming x and y are as above): + +(1) The link variables on the link (x,y), where x is even. Note that these are + not part of the local gauge field. Their total number is equal to half the + number of points on the face. + +(2) The link variables U(y,nu) where nu!=mu. None of these is contained in the + local gauge field. Their total number is 3 times the number of points on + the face. + +The number of all these link variables on the faces in direction +0,+1,+2 and ++3 is thus equal to BNDRY/4+3*BNDRY/2=7*BNDRY/4. + +In the gauge field array, the link variables of type (1) in direction +0,+1,+2 +and +3 come just after the local gauge field at offset=4*VOLUME. Then follow +the link variables of type (2) at the even points y on the face in direction ++0, then those at the odd points y on that face, then those at the even points +y on the face in direction +1, and so on. Within each boundary segment, the +link variables are ordered in the same way as the points y. + + +Contents of the structures of type uidx_t +----------------------------------------- + +A structure of type uidx_t contains index data that refer to the face +in a direction +mu. The elements of the structure are + + nu0: Number of link variables of type (1) on the face. + + nuk: Number of link variables of type (2) on the face. + + iu0[n]: Offsets of the link variables on the neighbouring MPI process in + direction +mu that correspond to the link variables of type (1) + on the face (n=0,..,nu0-1). + + iuk[n]: Offsets of the link variables on the neighbouring MPI process in + direction +mu that correspond to the link variables of type (2) + on the face (n=0,..,nuk-1). + +Using these index arrays, the boundary link variables are easily extracted +from the local gauge fields on the neighbouring MPI processes and copied +to the current process. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/bcnds.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/bcnds.c new file mode 100644 index 0000000000000000000000000000000000000000..c6e5370235f3a4e94b547455b665450fdef363ad --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/bcnds.c @@ -0,0 +1,699 @@ + +/******************************************************************************* +* +* File bcnds.c +* +* Copyright (C) 2005, 2010-2014 Martin Luescher, John Bulava +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Programs related to the boundary conditions in the time direction. +* +* int *bnd_lks(int *n) +* Returns the starting address of an array of length n whose elements +* are the integer offsets of the time-like link variables on the local +* lattice at global time NPROC0*L0-1. +* +* int *bnd_pts(int *n) +* Returns the starting address of an array of length n whose elements +* are the indices of the points on the local lattice at global time 0 +* (boundary conditions type 0,1 or 2) and time NPROC0*L0-1 (boundary +* conditions type 0). The ordering of the indices is such that the n/2 +* even points come first. +* +* void set_bc(void) +* Sets the double-precision link variables at time 0 and T to the +* values required by the chosen boundary conditions (see the notes). +* +* int check_bc(double tol) +* Returns 1 if the double-precision gauge field has the proper boundary +* values and if no active link variables are equal to zero. Otherwise +* the program returns 0. The parameter tol>=0.0 sets an upper bound on +* the tolerated difference of the boundary values of the gauge field from +* the expected ones in the case of SF and open-SF boundary conditions. +* +* int chs_ubnd(int ibc) +* Multiplies the double-precision link variables on the time-like links +* at time NPROC0*L0-1 by -1 if the following conditions are met: (1) ibc +* and the determinants of the link variables have opposite sign, (2) the +* boundary conditions are of type 3 (periodic for the gauge field). The +* program returns 1 if the link variables are changed and 0 otherwise. +* +* void bnd_s2zero(ptset_t set,spinor *s) +* Sets the components of the single-precision spinor field s on the +* specified set of points at global time 0 (boundary conditions type +* 0,1 or 2) and time NPROC0*L0-1 (boundary conditions type 0) to zero. +* +* void bnd_sd2zero(ptset_t set,spinor_dble *sd) +* Sets the components of the double-precision spinor field sd on the +* specified set of points at global time 0 (boundary conditions type +* 0,1 or 2) and time NPROC0*L0-1 (boundary conditions type 0) to zero. +* +* Notes: +* +* The time extent T of the lattice is +* +* NPROC0*L0-1 for open boundary conditions, +* +* NPROC0*L0 for SF, open-SF and periodic boundary conditions. +* +* Note that in the latter cases the points at time T are not in the local +* lattice and are omitted in the programs bnd_pts(), bnd_s2zero() and +* bnd_sd2zero(). +* +* The action performed by set_bc() is the following: +* +* Open bc: Set all link variables U(x,0) at time T to zero. +* +* SF bc: Reads the boundary values of the gauge field from the +* data base and assigns them to the link variables at +* time 0 and T. At time T the link variables are stored +* in the buffers appended to the local field on the MPI +* processes where cpr[0]=NPROC0-1. +* +* Open-SF bc: Same as SF bc, but omitting the assignment of the link +* variables at time 0. +* +* Periodic bc: No action is performed. +* +* Then the program checks whether any active link variables are equal to +* zero and, if some are found, aborts the program with an error message. +* +* The programs in this module act globally and should be called simultaneously +* on all MPI processes. After the first time, the programs bnd_s2zero() and +* bnd_sd2zero() may be locally called. +* +*******************************************************************************/ + +#define BCNDS_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "uflds.h" +#include "lattice.h" +#include "global.h" + +#define N0 (NPROC0*L0) + +typedef union +{ + su3_dble u; + double r[18]; +} umat_t; + +static int init0=0,nlks,*lks; +static int init1=0,npts,*pts; +static int init2=0; +static const su3_dble ud0={{0.0}}; +static const spinor s0={{{0.0f}}}; +static const spinor_dble sd0={{{0.0}}}; +static su3_dble ubnd[2][3]; + + +static void alloc_lks(void) +{ + int ix,t,*lk; + + error(iup[0][0]==0,1,"alloc_lks [bcnds.c]","Geometry arrays are not set"); + + if ((cpr[0]==0)||(cpr[0]==(NPROC0-1))) + { + if (NPROC0>1) + nlks=(L1*L2*L3)/2; + else + nlks=L1*L2*L3; + + lks=malloc(nlks*sizeof(*lks)); + + if (lks!=NULL) + { + lk=lks; + + for (ix=(VOLUME/2);ix0)&&(lks==NULL),1,"alloc_lks [bcnds.c]", + "Unable to allocate index array"); + init0=1; +} + + +static void alloc_pts(void) +{ + int bc,ix,t,*pt; + + error(iup[0][0]==0,1,"alloc_pts [bcnds.c]","Geometry arrays are not set"); + bc=bc_type(); + + if (((cpr[0]==0)&&(bc!=3))||((cpr[0]==(NPROC0-1))&&(bc==0))) + { + if ((NPROC0==1)&&(bc==0)) + npts=2*L1*L2*L3; + else + npts=L1*L2*L3; + + pts=malloc(npts*sizeof(*pts)); + + if (pts!=NULL) + { + pt=pts; + + for (ix=0;ix0)&&(pts==NULL),1,"alloc_pts [bcnds.c]", + "Unable to allocate index array"); + init1=1; +} + + +int *bnd_lks(int *n) +{ + if (init0==0) + alloc_lks(); + + (*n)=nlks; + + return lks; +} + + +int *bnd_pts(int *n) +{ + if (init1==0) + alloc_pts(); + + (*n)=npts; + + return pts; +} + + +static int is_zero(su3_dble *u) +{ + int i,it; + umat_t *um; + + um=(umat_t*)(u); + it=1; + + for (i=0;i<18;i++) + it&=((*um).r[i]==0.0); + + return it; +} + + +static int is_equal(double tol,su3_dble *u,su3_dble *v) +{ + int i,it; + umat_t *um,*vm; + + um=(umat_t*)(u); + vm=(umat_t*)(v); + it=1; + + for (i=0;i<18;i++) + it&=(fabs((*um).r[i]-(*vm).r[i])<=tol); + + return it; +} + + +static int check_zero(int bc) +{ + int it,ix,t,ifc; + su3_dble *u; + + it=1; + u=udfld(); + + for (ix=(VOLUME/2);ix1) + { + dprms[0]=tol; + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + error(dprms[0]!=tol,1,"check_bc [bcnds.c]","Parameter is not global"); + } + + bc=bc_type(); + it=check_zero(bc); + + if (bc==1) + it&=check_SF(tol); + else if (bc==2) + it&=check_openSF(tol); + + if (NPROC>1) + { + is=it; + MPI_Allreduce(&is,&it,1,MPI_INT,MPI_MIN,MPI_COMM_WORLD); + } + + return it; +} + + +static int sdet(su3_dble *u) +{ + double r; + complex_dble z; + + z.re= + (*u).c22.re*(*u).c33.re-(*u).c22.im*(*u).c33.im- + (*u).c32.re*(*u).c23.re+(*u).c32.im*(*u).c23.im; + + z.im= + (*u).c22.re*(*u).c33.im+(*u).c22.im*(*u).c33.re- + (*u).c32.re*(*u).c23.im-(*u).c32.im*(*u).c23.re; + + r=(*u).c11.re*z.re-(*u).c11.im*z.im; + + z.re= + (*u).c32.re*(*u).c13.re-(*u).c32.im*(*u).c13.im- + (*u).c12.re*(*u).c33.re+(*u).c12.im*(*u).c33.im; + + z.im= + (*u).c32.re*(*u).c13.im+(*u).c32.im*(*u).c13.re- + (*u).c12.re*(*u).c33.im-(*u).c12.im*(*u).c33.re; + + r+=((*u).c21.re*z.re-(*u).c21.im*z.im); + + z.re= + (*u).c12.re*(*u).c23.re-(*u).c12.im*(*u).c23.im- + (*u).c22.re*(*u).c13.re+(*u).c22.im*(*u).c13.im; + + z.im= + (*u).c12.re*(*u).c23.im+(*u).c12.im*(*u).c23.re- + (*u).c22.re*(*u).c13.im-(*u).c22.im*(*u).c13.re; + + r+=((*u).c31.re*z.re-(*u).c31.im*z.im); + + if (r>=0.0) + return 1; + else + return -1; +} + + +int chs_ubnd(int ibc) +{ + int iprms[1],i,ich,ichs; + int *lk,*lkm; + su3_dble *ub; + umat_t *um; + + if (bc_type()==3) + { + if (NPROC>1) + { + iprms[0]=ibc; + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + error(iprms[0]!=ibc,1,"chs_ubnd [bcnds.c]", + "Parameter is not global"); + } + + if (init0==0) + alloc_lks(); + + if (ibc>=0) + ibc=1; + else + ibc=-1; + + ub=udfld(); + ich=0; + + if (nlks>0) + { + lk=lks; + + if (sdet(ub+(*lk))!=ibc) + { + ich=1; + lkm=lk+nlks; + + for (;lk0) + { + if (set==ALL_PTS) + { + pt=pts; + pm=pts+npts; + } + else if (set==EVEN_PTS) + { + pt=pts; + pm=pts+npts/2; + } + else if (set==ODD_PTS) + { + pt=pts+npts/2; + pm=pts+npts; + } + else + return; + + for (;pt0) + { + if (set==ALL_PTS) + { + pt=pts; + pm=pts+npts; + } + else if (set==EVEN_PTS) + { + pt=pts; + pm=pts+npts/2; + } + else if (set==ODD_PTS) + { + pt=pts+npts/2; + pm=pts+npts; + } + else + return; + + for (;pt F_{mu nu}(x) +* ip[1] -> F_{mu nu}(x+mu) +* ip[2] -> F_{mu nu}(x+nu) +* ip[3] -> F_{mu nu}(x+mu+nu) +* +* In the program plaq_ftidx() it is taken for granted that 0<=ix +#include +#include +#include "su3.h" +#include "utils.h" +#include "lattice.h" +#include "global.h" + +static const int plns[6][2]={{0,1},{0,2},{0,3},{2,3},{3,1},{1,2}}; +static int nfc[4],ofs[4],*cn[6][2],init=0; +static ftidx_t idx[6]; + + +static void set_nft(void) +{ + int bs[4]; + int n,mu,nu; + + bs[0]=L0; + bs[1]=L1; + bs[2]=L2; + bs[3]=L3; + + nfc[0]=FACE0; + nfc[1]=FACE1; + nfc[2]=FACE2; + nfc[3]=FACE3; + + ofs[0]=VOLUME; + ofs[1]=ofs[0]+FACE0; + ofs[2]=ofs[1]+FACE1; + ofs[3]=ofs[2]+FACE2; + + for (n=0;n<6;n++) + { + mu=plns[n][0]; + nu=plns[n][1]; + + idx[n].nft[0]=nfc[mu]; + idx[n].nft[1]=nfc[nu]; + + if (nfc[nu]>0) + idx[n].nft[0]+=(nfc[mu]/bs[nu]); + } +} + + +static void alloc_idx(void) +{ + int n,mu,nu; + int np,*iw; + + set_nft(); + np=0; + + for (n=0;n<6;n++) + np+=(idx[n].nft[0]+idx[n].nft[1]); + + if (BNDRY>0) + { + iw=malloc((np+9*(BNDRY/2))*sizeof(*iw)); + error(iw==NULL,1,"alloc_idx [ftidx.c]", + "Unable to allocate index arrays"); + } + else + iw=NULL; + + for (n=0;n<6;n++) + { + idx[n].ift[0]=iw; + iw+=idx[n].nft[0]; + + idx[n].ift[1]=iw; + iw+=idx[n].nft[1]; + } + + for (n=0;n<6;n++) + { + mu=plns[n][0]; + nu=plns[n][1]; + + cn[n][0]=iw; + iw+=3*nfc[mu]; + + cn[n][1]=iw; + iw+=3*nfc[nu]; + } +} + + +static int ibnd(int mu,int iy) +{ + if (iy>(VOLUME+(BNDRY/2))) + return iy-ofs[mu]-BNDRY/2; + else + return iy-ofs[mu]-nfc[mu]/2; +} + + +static void set_idx(void) +{ + int n,mu,nu; + int ix,iy,iw,iz; + int iby,ibw,ibz; + int *ift[2],*cnn[2],nft0,nfc0,icn; + + alloc_idx(); + + for (n=0;n<6;n++) + { + mu=plns[n][0]; + nu=plns[n][1]; + + ift[0]=idx[n].ift[0]; + ift[1]=idx[n].ift[1]; + cnn[0]=cn[n][0]; + cnn[1]=cn[n][1]; + + nft0=idx[n].nft[0]; + nfc0=nfc[mu]; + icn=0; + + for (ix=0;ix=VOLUME) + { + iby=ibnd(mu,iy); + ift[0][iby]=map[iy-VOLUME]; + + if (iw>=VOLUME) + { + ibw=ibnd(nu,iw); + ift[1][ibw]=map[iw-VOLUME]; + + iz=map[iy-VOLUME]; + iz=iup[iz][nu]; + ibz=ibnd(nu,iz); + ift[0][nfc0+icn]=VOLUME+nft0+ibz; + + cnn[0][3*iby ]=VOLUME+iby; + cnn[0][3*iby+1]=VOLUME+nft0+ibw; + cnn[0][3*iby+2]=VOLUME+nfc0+icn; + + cnn[1][3*ibw ]=cnn[0][3*iby ]; + cnn[1][3*ibw+1]=cnn[0][3*iby+1]; + cnn[1][3*ibw+2]=cnn[0][3*iby+2]; + + icn+=1; + } + else + { + iz=iup[iw][mu]; + ibz=ibnd(mu,iz); + + cnn[0][3*iby ]=VOLUME+iby; + cnn[0][3*iby+1]=iw; + cnn[0][3*iby+2]=VOLUME+ibz; + } + } + else if (iw>=VOLUME) + { + ibw=ibnd(nu,iw); + ift[1][ibw]=map[iw-VOLUME]; + + iz=iup[iy][nu]; + ibz=ibnd(nu,iz); + + cnn[1][3*ibw ]=iy; + cnn[1][3*ibw+1]=VOLUME+nft0+ibw; + cnn[1][3*ibw+2]=VOLUME+nft0+ibz; + } + } + } + + init=1; +} + + +ftidx_t *ftidx(void) +{ + if (init==0) + set_idx(); + + return idx; +} + + +void plaq_ftidx(int n,int ix,int *ip) +{ + int mu,nu; + int iy,iw,k; + + if (init==0) + set_idx(); + + mu=plns[n][0]; + nu=plns[n][1]; + + iy=iup[ix][mu]; + iw=iup[ix][nu]; + ip[0]=ix; + + if (iy>=VOLUME) + { + k=3*ibnd(mu,iy); + ip[1]=cn[n][0][k]; + ip[2]=cn[n][0][k+1]; + ip[3]=cn[n][0][k+2]; + } + else if (iw>=VOLUME) + { + k=3*ibnd(nu,iw); + ip[1]=cn[n][1][k]; + ip[2]=cn[n][1][k+1]; + ip[3]=cn[n][1][k+2]; + } + else + { + ip[1]=iy; + ip[2]=iw; + ip[3]=iup[iy][nu]; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/geometry.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/geometry.c new file mode 100644 index 0000000000000000000000000000000000000000..345ef95994f878b38aa2835354d174f8a8dc3762 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/geometry.c @@ -0,0 +1,693 @@ + +/******************************************************************************* +* +* File geometry.c +* +* Copyright (C) 2005, 2008, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Programs related to the lattice and block geometry. +* +* The externally accessible functions are +* +* int ipr_global(int *n) +* This program returns the rank of the MPI process with Cartesian +* coordinates n[0],..,n[3] in the process grid. +* +* void ipt_global(int *x,int *ip,int *ix) +* Given the Cartesian coordinates x[0],..,x[3] of a point on the full +* lattice, this program finds the local lattice containing x. On exit +* the rank of the associated MPI process is assigned to ip and the +* local index of the point to ix. +* +* int global_time(int ix) +* Returns the (global) time coordinate of the lattice point with local +* index ix. +* +* void geometry(void) +* Computes the global arrays cpr,npr describing the MPI process grid +* and the index arrays ipt,iup,idn and map that characterize the lattice +* geometry (see main/global.h). +* +* void blk_geometry(block_t *b) +* Computes the index arrays b.ipt,b.iup and b.idn that describe the +* geometry of the block b. +* +* void blk_imbed(block_t *b) +* Computes the index arrays b.imb and b.ibp that describe the +* embedding of the block b in the full lattice. +* +* void bnd_geometry(block_t *b) +* Computes the index arrays bb.ipp and bb.map that describe the +* geometry of the exterior boundaries bb of the block b. +* +* void bnd_imbed(block_t *b) +* Computes the index arrays bb.imb that describe the embedding +* of the exterior boundaries bb of the block b in the full lattice. +* +* Notes: +* +* See main/README.global for a description of the lattice geometry and +* block/README.block for explanations of the block structure. +* +* The programs geometry() and blk_geometry() may involve communications and +* must be called simultaneously on all processes. All other programs can be +* called locally. +* +*******************************************************************************/ + +#define GEOMETRY_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "global.h" + +#define NPROC_BLK (NPROC0_BLK*NPROC1_BLK*NPROC2_BLK*NPROC3_BLK) +#define NBLK0 (NPROC0/NPROC0_BLK) +#define NBLK1 (NPROC1/NPROC1_BLK) +#define NBLK2 (NPROC2/NPROC2_BLK) +#define NBLK3 (NPROC3/NPROC3_BLK) + +static int cbs[4],cbn[4],*cbix=NULL; +static int *tms=NULL; + + +int ipr_global(int *n) +{ + int ib,ip; + int n0,n1,n2,n3; + int nb0,nb1,nb2,nb3; + int np0,np1,np2,np3; + + n0=safe_mod(n[0],NPROC0); + n1=safe_mod(n[1],NPROC1); + n2=safe_mod(n[2],NPROC2); + n3=safe_mod(n[3],NPROC3); + + nb0=n0/NPROC0_BLK; + nb1=n1/NPROC1_BLK; + nb2=n2/NPROC2_BLK; + nb3=n3/NPROC3_BLK; + + np0=n0%NPROC0_BLK; + np1=n1%NPROC1_BLK; + np2=n2%NPROC2_BLK; + np3=n3%NPROC3_BLK; + + ib=nb0; + ib=ib*NBLK1+nb1; + ib=ib*NBLK2+nb2; + ib=ib*NBLK3+nb3; + + ip=np0; + ip=ip*NPROC1_BLK+np1; + ip=ip*NPROC2_BLK+np2; + ip=ip*NPROC3_BLK+np3; + + return ip+ib*NPROC_BLK; +} + + +void ipt_global(int *x,int *ip,int *ix) +{ + int x0,x1,x2,x3; + int n[4]; + + x0=safe_mod(x[0],NPROC0*L0); + x1=safe_mod(x[1],NPROC1*L1); + x2=safe_mod(x[2],NPROC2*L2); + x3=safe_mod(x[3],NPROC3*L3); + + n[0]=x0/L0; + n[1]=x1/L1; + n[2]=x2/L2; + n[3]=x3/L3; + + (*ip)=ipr_global(n); + + x0=x0%L0; + x1=x1%L1; + x2=x2%L2; + x3=x3%L3; + + (*ix)=ipt[x3+L3*x2+L2*L3*x1+L1*L2*L3*x0]; +} + + +int global_time(int ix) +{ + if ((tms!=NULL)&&(ix>=0)&&(ix=NPROC),1,"set_cpr [geometry.c]", + "Rank of process is out of range"); + + ib=nr/NPROC_BLK; + ip=nr%NPROC_BLK; + + cpr[3]=(ib%NBLK3)*NPROC3_BLK+(ip%NPROC3_BLK); + ib/=NBLK3; + ip/=NPROC3_BLK; + + cpr[2]=(ib%NBLK2)*NPROC2_BLK+(ip%NPROC2_BLK); + ib/=NBLK2; + ip/=NPROC2_BLK; + + cpr[1]=(ib%NBLK1)*NPROC1_BLK+(ip%NPROC1_BLK); + ib/=NBLK1; + ip/=NPROC1_BLK; + + cpr[0]=ib*NPROC0_BLK+ip; +} + + +static void set_npr(void) +{ + int mu,n[4]; + + for (mu=0;mu<4;mu++) + n[mu]=cpr[mu]; + + for (mu=0;mu<4;mu++) + { + n[mu]-=1; + npr[2*mu]=ipr_global(n); + n[mu]+=2; + npr[2*mu+1]=ipr_global(n); + n[mu]-=1; + } +} + + +static void cache_block(int *bs) +{ + int mu; + + cbs[0]=bs[0]; + cbn[0]=1; + + for (mu=1;mu<4;mu++) + { + if ((bs[mu]%4)==0) + cbs[mu]=4; + else if ((bs[mu]%3)==0) + cbs[mu]=3; + else if ((bs[mu]%2)==0) + cbs[mu]=2; + else + cbs[mu]=1; + + cbn[mu]=bs[mu]/cbs[mu]; + } + + if (cbix!=NULL) + free(cbix); + + cbix=malloc(cbs[0]*cbs[1]*cbs[2]*cbs[3]*sizeof(*cbix)); + error(cbix==NULL,1,"cache_block [geometry.c]", + "Unable to allocate auxiliary array"); +} + + +static void set_cbix(void) +{ + int x0,x1,x2,x3; + int ig,iu,ib,is; + + ig=0; + iu=0; + + for (x0=0;x01)) + iup[ix][0]=VOLUME; + if ((x0==0)&&(NPROC0>1)) + idn[ix][0]=VOLUME; + + if ((x1==(L1-1))&&(NPROC1>1)) + iup[ix][1]=VOLUME; + if ((x1==0)&&(NPROC1>1)) + idn[ix][1]=VOLUME; + + if ((x2==(L2-1))&&(NPROC2>1)) + iup[ix][2]=VOLUME; + if ((x2==0)&&(NPROC2>1)) + idn[ix][2]=VOLUME; + + if ((x3==(L3-1))&&(NPROC3>1)) + iup[ix][3]=VOLUME; + if ((x3==0)&&(NPROC3>1)) + idn[ix][3]=VOLUME; + } + } + } + } + + ifc[0]=0; + ifc[1]=ifc[0]+(FACE0/2); + ifc[2]=ifc[1]+(FACE0/2); + ifc[3]=ifc[2]+(FACE1/2); + ifc[4]=ifc[3]+(FACE1/2); + ifc[5]=ifc[4]+(FACE2/2); + ifc[6]=ifc[5]+(FACE2/2); + ifc[7]=ifc[6]+(FACE3/2); + + for (ix=0;ix0) + (*b).idn[ix][0]=index(bo,bs,x0-1,x1,x2,x3); + else + (*b).idn[ix][0]=(*b).vol; + + if ((x1+1)0) + (*b).idn[ix][1]=index(bo,bs,x0,x1-1,x2,x3); + else + (*b).idn[ix][1]=(*b).vol; + + if ((x2+1)0) + (*b).idn[ix][2]=index(bo,bs,x0,x1,x2-1,x3); + else + (*b).idn[ix][2]=(*b).vol; + + if ((x3+1)0) + (*b).idn[ix][3]=index(bo,bs,x0,x1,x2,x3-1); + else + (*b).idn[ix][3]=(*b).vol; + } + } + } + } + + (*b).ipt[(*b).vol]=(*b).ipt[0]; + + free(cbix); + cbix=NULL; +} + + +void blk_imbed(block_t *b) +{ + int *bo,*bs; + int x0,x1,x2,x3; + int ix,iy,ibd,ibu,*ibp; + + bo=(*b).bo; + bs=(*b).bs; + + for (x0=0;x0=VOLUME) + (*bb).ibn=1; + else + (*bb).ibn=0; + + bb+=1; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/uidx.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/uidx.c new file mode 100644 index 0000000000000000000000000000000000000000..0dc239d2cd06e17da07540b964824ced590ccd5d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/uidx.c @@ -0,0 +1,262 @@ + +/******************************************************************************* +* +* File uidx.c +* +* Copyright (C) 2010, 2011, 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Labeling of the link variables on the faces of the local lattice. +* +* The externally accessible functions are +* +* uidx_t *uidx(void) +* Returns an array idx[4] of uidx_t structures containing the offsets +* of the link variables at the faces of the local lattice. +* +* void plaq_uidx(int n,int ix,int *ip) +* Calculates the offsets ip[4] of the links in the (mu,nu)-plaquette at +* the point on the local lattice with label ix. The indices (mu,nu) are +* determined by the parameter n=0,..,5. +* +* Notes: +* +* The layout of the double-precision gauge field array and contents of the +* index structures returned by uidx() are described in the file README.uidx +* in this directory. The index arrays calculated by uidx() are determined +* by the local geometry of the lattice and are therefore independent of the +* boundary conditions. +* +* There are six planes +* +* (mu,nu)={(0,1),(0,2),(0,3),(2,3),(3,1),(1,2)} +* +* labeled by an integer n running from 0 to 5 and the links in the +* (mu,nu)-plaquette at the point x are ordered such that +* +* ip[0] -> U(x,mu) +* ip[1] -> U(x+mu,nu) +* ip[2] -> U(x,nu) +* ip[3] -> U(x+nu,mu) +* +* In the program plaq_uidx() it is taken for granted that 0<=ix +#include +#include +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "global.h" + +#define N0 (NPROC0*L0) + +static const int plns[6][2]={{0,1},{0,2},{0,3},{2,3},{3,1},{1,2}}; +static int type,nfc[4],ofs[4],snu[4],init=0; +static uidx_t idx[4]; + + +static void alloc_idx(void) +{ + int mu,nu0,nuk; + int *iu0,*iuk; + + error(iup[0][0]==0,1,"alloc_idx [uidx.c]", + "Geometry arrays are not set"); + + type=bc_type(); + nfc[0]=FACE0/2; + nfc[1]=FACE1/2; + nfc[2]=FACE2/2; + nfc[3]=FACE3/2; + + ofs[0]=VOLUME+(FACE0/2); + ofs[1]=ofs[0]+(FACE0/2)+(FACE1/2); + ofs[2]=ofs[1]+(FACE1/2)+(FACE2/2); + ofs[3]=ofs[2]+(FACE2/2)+(FACE3/2); + + snu[0]=0; + snu[1]=snu[0]+(FACE0/2); + snu[2]=snu[1]+(FACE1/2); + snu[3]=snu[2]+(FACE2/2); + + if (BNDRY>0) + { + iu0=malloc(7*(BNDRY/4)*sizeof(*iu0)); + error(iu0==NULL,1,"alloc_idx [uidx.c]", + "Unable to allocate index array"); + iuk=iu0+(BNDRY/4); + } + else + { + iu0=NULL; + iuk=NULL; + } + + for (mu=0;mu<4;mu++) + { + nu0=nfc[mu]; + nuk=6*nfc[mu]; + + idx[mu].nu0=nu0; + idx[mu].nuk=nuk; + + if (nu0>0) + { + idx[mu].iu0=iu0; + idx[mu].iuk=iuk; + iu0+=nu0; + iuk+=nuk; + } + else + { + idx[mu].iu0=NULL; + idx[mu].iuk=NULL; + } + } +} + + +static int offset(int ix,int mu) +{ + int iy,ib; + + if (ix<(VOLUME/2)) + { + iy=iup[ix][mu]; + + if (iy=mu); + iuk[3*ib+k]=offset(iz,nu); + } + } + + for (ib=0;ib=mu); + iuk[3*(ib+nu0)+k]=offset(iz,nu); + } + } + } + + init=1; +} + + +uidx_t *uidx(void) +{ + if (init==0) + set_idx(); + + return idx; +} + + +void plaq_uidx(int n,int ix,int *ip) +{ + int mu,nu; + int iy,ic; + + if (init==0) + set_idx(); + + mu=plns[n][0]; + nu=plns[n][1]; + + ip[0]=offset(ix,mu); + + if ((mu==0)&&(global_time(ix)==(N0-1))&&((type==1)||(type==2))) + { + ip[1]=4*VOLUME+7*(BNDRY/4)+nu-1; + } + else + { + iy=iup[ix][mu]; + + if (iymu); + } + } + + ip[2]=offset(ix,nu); + iy=iup[ix][nu]; + + if (iynu); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linalg/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linalg/README new file mode 100644 index 0000000000000000000000000000000000000000..ef307cb4de26df5202e6a985831accfbb5c1d39b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linalg/README @@ -0,0 +1,233 @@ + +******************************************************************************** + + Linear algebra + +******************************************************************************** + + +Files +----- + +cmatrix.c Complex matrix algebra (single-precision version) + +cmatrix_dble.c Complex matrix algebra (double-precision version) + +liealg.c Basic functions for fields with values in the Lie + algebra of SU(3) + +salg.c Generic linear algebra routines for single-precision + spinor fields + +salg_dble.c Generic linear algebra routines for double-precision + spinor fields + +valg.c Generic linear algebra routines for single-precision + complex fields + +valg_dble.c Generic linear algebra routines for double-precision + complex fields + + +Include file +------------ + +The file linalg.h defines the prototypes for all externally accessible +functions that are defined in the *.c files listed above. + + +List of functions +----------------- + +void cmat_vec(int n,complex *a,complex *v,complex *w) + Computes w=a*v, where v and w are n-vectors and a an nxn matrix. + +void cmat_vec_assign(int n,complex *a,complex *v,complex *w) + Adds a*v to w, where v and w are n-vectors and a an nxn matrix. + +void cmat_add(int n,complex *a,complex *b,complex *c) + Computes the sum c=a+b of two nxn matrices a and b. + +void cmat_sub(int n,complex *a,complex *b,complex *c) + Computes the difference c=a-b of two nxn matrices a and b. + +void cmat_mul(int n,complex *a,complex *b,complex *c) + Computes the product c=a*b of two nxn matrices a and b. + +void cmat_dag(int n,complex *a,complex *b) + Assigns the hermitian conjugate of a to b. + +void cmat_vec_dble(int n,complex_dble *a,complex_dble *v,complex_dble *w) + Computes w=a*v, where v and w are n-vectors and a an nxn matrix. + +void cmat_vec_assign_dble(int n,complex_dble *a,complex_dble *v, + complex_dble *w) + Adds a*v to w, where v and w are n-vectors and a an nxn matrix. + +void cmat_add_dble(int n,complex_dble *a,complex_dble *b,complex_dble *c) + Computes the sum c=a+b of two nxn matrices a and b. + +void cmat_sub_dble(int n,complex_dble *a,complex_dble *b,complex_dble *c) + Computes the difference c=a-b of two nxn matrices a and b. + +void cmat_mul_dble(int n,complex_dble *a,complex_dble *b,complex_dble *c) + Computes the product c=a*b of two nxn matrices a and b. + +void cmat_dag_dble(int n,complex_dble *a,complex_dble *b) + Assigns the hermitian conjugate of a to b. + +int cmat_inv_dble(int n,complex_dble *a,complex_dble *b,double *k) + Computes the inverse b of the nxn matrix a, using Householder + reflections. The Frobenius condition number k of a is also computed. + A non-zero return value indicates that the input matrix was found to + be singular within rounding errors and that the program terminated + prematurely. + +void random_alg(int vol,su3_alg_dble *X) + Initializes the Lie algebra elements X to random values + with distribution proportional to exp{tr[X^2]}. + +double norm_square_alg(int vol,int icom,su3_alg_dble *X) + Computes the square of the norm of the norm squared of the field X. + +double scalar_prod_alg(int vol,int icom,su3_alg_dble *X,su3_alg_dble *Y) + Computes the scalar product of the fields X and Y. + +void set_alg2zero(int vol,su3_alg_dble *X) + Sets the array elements X to zero. + +void set_ualg2zero(int vol,u3_alg_dble *X) + Sets the array elements X to zero. + +void assign_alg2alg(int vol,su3_alg_dble *X,su3_alg_dble *Y) + Assigns the field X to the field Y. + +void swap_alg(int vol,su3_alg_dble *X,su3_alg_dble *Y) + Swaps the fields X and Y. + +void muladd_assign_alg(int vol,double r,su3_alg_dble *X,su3_alg_dble *Y) + Adds r*X to Y. + +complex spinor_prod(int vol,int icom,spinor *s,spinor *r) + Computes the scalar product of the fields s and r. + +float spinor_prod_re(int vol,int icom,spinor *s,spinor *r) + Computes the real part of the scalar product of the fields + s and r. + +float norm_square(int vol,int icom,spinor *s) + Computes the square of the norm of the field s. + +void mulc_spinor_add(int vol,spinor *s,spinor *r,complex z) + Replaces the field s by s+z*r. + +void mulr_spinor_add(int vol,spinor *s,spinor *r,float c) + Replaces the field s by s+c*r. + +void project(int vol,int icom,spinor *s,spinor *r) + Replaces the field s by s-(r,s)*r. + +void scale(int vol,float c,spinor *s) + Replaces the field s by c*s. + +float normalize(int vol,int icom,spinor *s) + Replaces the field s by s/||s|| and returns the norm ||s||. + +void rotate(int vol,int n,spinor **ppk,complex *v) + Replaces the fields pk[] by sum_j pj*v[n*j+k] where 0<=k,j +#include +#include +#include "su3.h" +#include "utils.h" +#include "linalg.h" + +#if (defined AVX) +#include "avx.h" + +void cmat_vec(int n,complex *a,complex *v,complex *w) +{ + complex *b,*vv,*vm,*wm; + + if ((n&0x3)==0x0) + { + vm=v+n; + wm=w+n; + b=a; + + for (;w +#include +#include +#include +#include "su3.h" +#include "utils.h" +#include "linalg.h" + +#ifndef ALIGN +#define ALIGN 6 +#endif + +static int nmax=0; +static double *rsv; +static complex_dble *dsv; + +#if (defined AVX) +#include "avx.h" + +void cmat_vec_dble(int n,complex_dble *a,complex_dble *v,complex_dble *w) +{ + complex_dble *vv,*vm,*wm;; + + if ((n&0x3)==0x0) + { + vm=v+n; + wm=w+n; + + for (;w0) + { + nmax=0; + afree(rsv); + afree(dsv); + rsv=NULL; + dsv=NULL; + } + + if (n>0) + { + rsv=amalloc(n*sizeof(*rsv),ALIGN); + dsv=amalloc(n*sizeof(*dsv),ALIGN); + + if (error_loc((rsv==NULL)||(dsv==NULL),1,"alloc_arrays [cmatrix_dble.c]", + "Unable to allocate auxiliary arrays")==0) + { + nmax=n; + return 0; + } + else + { + if (rsv!=NULL) + afree(rsv); + if (dsv!=NULL) + afree(dsv); + rsv=NULL; + dsv=NULL; + + return 1; + } + } + + return 0; +} + + +static int fwd_house(int n,complex_dble *a,complex_dble *b,double *fnsq) +{ + int i,j,k; + double eps,r1,r2,r3; + complex_dble z,*bb,*bm,*bk,*bj; + + *fnsq=0.0; + bm=b+n*n; + + for (bb=b;bb=eps) + r1=sqrt(r1); + else + return 3; + + if (r2>=(DBL_EPSILON*r1)) + { + r3=1.0/r2; + z.re=r3*b[n*k+k].re; + z.im=r3*b[n*k+k].im; + } + else + { + z.re=1.0; + z.im=0.0; + } + + b[n*k+k].re+=r1*z.re; + b[n*k+k].im+=r1*z.im; + + r3=1.0/(r1*(r1+r2)); + rsv[k]=r3; + dsv[k].re=-(r1+r2)*r3*z.re; + dsv[k].im= (r1+r2)*r3*z.im; + + for (j=(k+1);j=eps) + r1=1.0/r1; + else + return 3; + + dsv[n-1].re= r1*(*bb).re; + dsv[n-1].im=-r1*(*bb).im; + + return 0; +} + + +static void solv_sys(int n,complex_dble *b) +{ + int i,j,k; + complex_dble *bi,*bk,z; + + for (k=(n-1);k>0;k--) + { + for (i=(k-1);i>=0;i--) + { + bi=b+n*i+k; + bk=b+n*k-n+k; + z.re=(*bi).re*dsv[k].re-(*bi).im*dsv[k].im; + z.im=(*bi).re*dsv[k].im+(*bi).im*dsv[k].re; + + for (j=(k-1);j>i;j--) + { + bi-=1; + z.re+=((*bi).re*(*bk).re-(*bi).im*(*bk).im); + z.im+=((*bi).re*(*bk).im+(*bi).im*(*bk).re); + bk-=n; + } + + (*bk).re=-dsv[i].re*z.re+dsv[i].im*z.im; + (*bk).im=-dsv[i].re*z.im-dsv[i].im*z.re; + } + } +} + + +static void bck_house(int n,complex_dble *b) +{ + int i,j,k; + complex_dble *bi,*dj,z; + + b[n*n-1].re=dsv[n-1].re; + b[n*n-1].im=dsv[n-1].im; + + for (k=(n-2);k>=0;k--) + { + z.re=dsv[k].re; + z.im=dsv[k].im; + dsv[k].re=b[n*k+k].re; + dsv[k].im=b[n*k+k].im; + b[n*k+k].re=z.re; + b[n*k+k].im=z.im; + + for (j=(k+1);jnmax) + { + if (alloc_arrays(n)!=0) + return 1; + } + + ie=fwd_house(n,a,b,&fnsq); + + if (ie!=0) + return ie; + + solv_sys(n,b); + bck_house(n,b); + + bb=b; + bm=bb+n*n; + fnsqi=0.0; + + for (;bb +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "random.h" +#include "linalg.h" +#include "global.h" + +#define MAX_LEVELS 12 +#define BLK_LENGTH 8 + +static int cnt[MAX_LEVELS]; +static double smx[MAX_LEVELS]; +static double c1=0.0,c2,c3,rb[8],sm; + + +void random_alg(int vol,su3_alg_dble *X) +{ + su3_alg_dble *Xm; + + if (c1==0.0) + { + c1=(sqrt(3.0)+1.0)/6.0; + c2=(sqrt(3.0)-1.0)/6.0; + c3=1.0/sqrt(2.0); + } + + Xm=X+vol; + + for (;X=BLK_LENGTH)&&(n1)) + { + sm=smx[0]; + MPI_Reduce(&sm,smx,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(smx,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + return 4.0*smx[0]; +} + + +double scalar_prod_alg(int vol,int icom,su3_alg_dble *X,su3_alg_dble *Y) +{ + int n; + su3_alg_dble *Xm; + + for (n=0;n=BLK_LENGTH)&&(n1)) + { + sm=smx[0]; + MPI_Reduce(&sm,smx,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(smx,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + return smx[0]; +} + + +void set_alg2zero(int vol,su3_alg_dble *X) +{ + su3_alg_dble *Xm; + + Xm=X+vol; + + for (;X +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "sflds.h" +#include "linalg.h" +#include "global.h" + +static int nrot=0,ifail=0; +static spinor *psi; + + +static void alloc_wrotate(int n) +{ + if (nrot>0) + afree(psi); + + psi=amalloc(n*sizeof(*psi),ALIGN); + + if (psi==NULL) + { + error_loc(1,1,"alloc_wrotate [salg.c]", + "Unable to allocate workspace"); + nrot=0; + ifail=1; + } + else + { + nrot=n; + set_s2zero(n,psi); + } +} + +#if (defined AVX) +#include "avx.h" + +complex spinor_prod(int vol,int icom,spinor *s,spinor *r) +{ + complex z; + complex_dble v,w; + spinor *sm; + + __asm__ __volatile__ ("vxorpd %%ymm9, %%ymm9, %%ymm9 \n\t" + "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" + "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" + "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" + "vxorps %%ymm13, %%ymm13, %%ymm13 \n\t" + "vxorps %%ymm14, %%ymm14, %%ymm14" + : + : + : + "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14"); + + sm=s+vol; + + for (;s1)) + { + mpc_gsum_d(&v.re,&w.re,2); + + z.re=(float)(w.re); + z.im=(float)(w.im); + } + else + { + z.re=(float)(v.re); + z.im=(float)(v.im); + } + + return z; +} + + +float spinor_prod_re(int vol,int icom,spinor *s,spinor *r) +{ + double x,y; + spinor *sm; + + __asm__ __volatile__ ("vxorpd %%ymm9, %%ymm9, %%ymm9 \n\t" + "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" + "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" + : + : + : + "xmm9", "xmm10", "xmm11"); + + sm=s+vol; + + for (;s1)) + { + mpc_gsum_d(&x,&y,1); + return (float)(y); + } + else + return (float)(x); +} + + +float norm_square(int vol,int icom,spinor *s) +{ + double x,y; + spinor *sm; + + __asm__ __volatile__ ("vxorpd %%ymm9, %%ymm9, %%ymm9 \n\t" + "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" + "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" + : + : + : + "xmm9", "xmm10", "xmm11"); + + sm=s+vol; + + for (;s1)) + { + mpc_gsum_d(&x,&y,1); + return (float)(y); + } + else + return (float)(x); +} + + +void mulc_spinor_add(int vol,spinor *s,spinor *r,complex z) +{ + spinor *sm; + + _avx_load_cmplx_up(z); + sm=s+vol; + + for (;snrot)&&(ifail==0)) + alloc_wrotate(n); + + if ((n>0)&&(ifail==0)) + { + for (ix=0;ix1)) + { + v.re=x; + v.im=-y; + + mpc_gsum_d(&v.re,&w.re,2); + + z.re=(float)(w.re); + z.im=(float)(w.im); + } + else + { + z.re=(float)(x); + z.im=(float)(-y); + } + + return z; +} + + +float spinor_prod_re(int vol,int icom,spinor *s,spinor *r) +{ + double x,y; + spinor *sm; + + __asm__ __volatile__ ("xorpd %%xmm10, %%xmm10 \n\t" + "xorpd %%xmm11, %%xmm11 \n\t" + "xorpd %%xmm12, %%xmm12" + : + : + : + "xmm10", "xmm11", "xmm12"); + + sm=s+vol; + + for (;s1)) + { + mpc_gsum_d(&x,&y,1); + return (float)(y); + } + else + return (float)(x); +} + + +float norm_square(int vol,int icom,spinor *s) +{ + double x,y; + spinor *sm; + + __asm__ __volatile__ ("xorpd %%xmm10, %%xmm10 \n\t" + "xorpd %%xmm11, %%xmm11 \n\t" + "xorpd %%xmm12, %%xmm12" + : + : + : + "xmm10", "xmm11", "xmm12"); + + sm=s+vol; + + for (;s1)) + { + mpc_gsum_d(&x,&y,1); + return (float)(y); + } + else + return (float)(x); +} + + +void mulc_spinor_add(int vol,spinor *s,spinor *r,complex z) +{ + spinor *sm; + + _sse_load_cmplx(z); + sm=s+vol; + + for (;snrot)&&(ifail==0)) + alloc_wrotate(n); + + if ((n>0)&&(ifail==0)) + { + for (ix=0;ixnrot)&&(ifail==0)) + alloc_wrotate(n); + + if ((n>0)&&(ifail==0)) + { + for (ix=0;ixnrot)&&(ifail==0)) + alloc_wrotate(n); + + if ((n>0)&&(ifail==0)) + { + for (ix=0;ix +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "sflds.h" +#include "linalg.h" +#include "global.h" + +#define MAX_LEVELS 12 +#define BLK_LENGTH 8 + +static int nrot=0,ifail=0; +static int cnt[MAX_LEVELS]; +static double smx[MAX_LEVELS] ALIGNED16; +#if (defined QPX) +static double smy[MAX_LEVELS] ALIGNED16; +#endif +static complex_dble smz[MAX_LEVELS] ALIGNED16; +static spinor_dble *psi; + + +static void alloc_wrotate(int n) +{ + if (nrot>0) + afree(psi); + + psi=amalloc(n*sizeof(*psi),ALIGN); + + if (psi==NULL) + { + error_loc(1,1,"alloc_wrotate [salg_dble.c]", + "Unable to allocate workspace"); + nrot=0; + ifail=1; + } + else + { + nrot=n; + set_sd2zero(n,psi); + } +} + +#if (defined AVX) +#include "avx.h" + +complex_dble spinor_prod_dble(int vol,int icom,spinor_dble *s,spinor_dble *r) +{ + int n; + complex_dble w,z; + spinor_dble *sm,*smb; + + for (n=0;nsm) + smb=sm; + + __asm__ __volatile__ ("vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorpd %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5, %%ymm5, %%ymm5" + : + : + : + "xmm0", "xmm1", "xmm2", + "xmm3", "xmm4", "xmm5"); + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&v.re,&w.re,2); + return z; + } + else + return w; +} + + +double spinor_prod_re_dble(int vol,int icom,spinor_dble *s,spinor_dble *r) +{ + int n; + double x,y; + spinor_dble *sm,*smb; + + for (n=0;nsm) + smb=sm; + + __asm__ __volatile__ ("vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t" + : + : + : + "xmm0", "xmm1", "xmm2"); + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&x,&y,1); + return y; + } + else + return x; +} + + +complex_dble spinor_prod5_dble(int vol,int icom,spinor_dble *s,spinor_dble *r) +{ + int n; + complex_dble w,z; + spinor_dble *sm,*smb; + + for (n=0;nsm) + smb=sm; + + __asm__ __volatile__ ("vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorpd %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5, %%ymm5, %%ymm5" + : + : + : + "xmm0", "xmm1", "xmm2", + "xmm3", "xmm4", "xmm5"); + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&w.re,&z.re,2); + return z; + } + else + return w; +} + + +double norm_square_dble(int vol,int icom,spinor_dble *s) +{ + int n; + double x,y; + spinor_dble *sm,*smb; + + for (n=0;nsm) + smb=sm; + + __asm__ __volatile__ ("vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t" + "vxorpd %%ymm8, %%ymm8, %%ymm8 \n\t" + "vxorpd %%ymm9, %%ymm9, %%ymm9 \n\t" + "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" + "vxorpd %%ymm11, %%ymm11, %%ymm11" + : + : + : + "xmm6", "xmm7", "xmm8", + "xmm9", "xmm10", "xmm11"); + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&y,&x,1); + return x; + } + else + return y; +} + + +void mulc_spinor_add_dble(int vol,spinor_dble *s,spinor_dble *r, + complex_dble z) +{ + spinor_dble *sm; + + _avx_load_cmplx_up_dble(z); + sm=s+vol; + + for (;snrot)&&(ifail==0)) + alloc_wrotate(n); + + if ((n>0)&&(ifail==0)) + { + for (ix=0;ixsm) + smb=sm; + + __asm__ __volatile__ ("xorpd %%xmm6, %%xmm6 \n\t" + "xorpd %%xmm7, %%xmm7 \n\t" + "xorpd %%xmm8, %%xmm8 \n\t" + "xorpd %%xmm9, %%xmm9 \n\t" + "xorpd %%xmm10, %%xmm10 \n\t" + "xorpd %%xmm11, %%xmm11" + : + : + : + "xmm6", "xmm7", "xmm8", + "xmm9", "xmm10", "xmm11"); + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&w.re,&z.re,2); + return z; + } + else + return w; +} + + +double spinor_prod_re_dble(int vol,int icom,spinor_dble *s,spinor_dble *r) +{ + int n; + double x,y; + spinor_dble *sm,*smb; + + for (n=0;nsm) + smb=sm; + + __asm__ __volatile__ ("xorpd %%xmm6, %%xmm6 \n\t" + "xorpd %%xmm7, %%xmm7 \n\t" + "xorpd %%xmm8, %%xmm8" + : + : + : + "xmm6", "xmm7", "xmm8"); + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&y,&x,1); + return x; + } + else + return y; +} + + +complex_dble spinor_prod5_dble(int vol,int icom,spinor_dble *s,spinor_dble *r) +{ + int n; + complex_dble w,z; + spinor_dble *sm,*smb; + + for (n=0;nsm) + smb=sm; + + __asm__ __volatile__ ("xorpd %%xmm6, %%xmm6 \n\t" + "xorpd %%xmm7, %%xmm7 \n\t" + "xorpd %%xmm8, %%xmm8 \n\t" + "xorpd %%xmm9, %%xmm9 \n\t" + "xorpd %%xmm10, %%xmm10 \n\t" + "xorpd %%xmm11, %%xmm11" + : + : + : + "xmm6", "xmm7", "xmm8", + "xmm9", "xmm10", "xmm11"); + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&w.re,&z.re,2); + return z; + } + else + return w; +} + + +double norm_square_dble(int vol,int icom,spinor_dble *s) +{ + int n; + double x,y; + spinor_dble *sm,*smb; + + for (n=0;nsm) + smb=sm; + + __asm__ __volatile__ ("xorpd %%xmm6, %%xmm6 \n\t" + "xorpd %%xmm7, %%xmm7 \n\t" + "xorpd %%xmm8, %%xmm8" + : + : + : + "xmm6", "xmm7", "xmm8"); + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&y,&x,1); + return x; + } + else + return y; +} + + +void mulc_spinor_add_dble(int vol,spinor_dble *s,spinor_dble *r, + complex_dble z) +{ + spinor_dble *sm; + + _sse_load_cmplx_dble(z); + sm=s+vol; + + for (;snrot)&&(ifail==0)) + alloc_wrotate(n); + + if ((n>0)&&(ifail==0)) + { + for (ix=0;ixsm) + smb=sm; + + x=0.0; + y=0.0; + + for (;s=BLK_LENGTH)&&(nsm) + smb=sm; + + x=0.0; + + for (;s=BLK_LENGTH)&&(nsm) + smb=sm; + + x=0.0; + y=0.0; + + for (;s=BLK_LENGTH)&&(nsm) + smb=sm; + + x=0.0; + + for (;s=BLK_LENGTH)&&(nnrot)&&(ifail==0)) + alloc_wrotate(n); + + if ((n>0)&&(ifail==0)) + { + for (ix=0;ixsm) + smb=sm; + + z.re=0.0; + z.im=0.0; + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&w.re,&z.re,2); + return z; + } + else + return w; +} + + +double spinor_prod_re_dble(int vol,int icom,spinor_dble *s,spinor_dble *r) +{ + int n; + double x,y; + spinor_dble *sm,*smb; + + for (n=0;nsm) + smb=sm; + + x=0.0; + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&y,&x,1); + return x; + } + else + return y; +} + + +complex_dble spinor_prod5_dble(int vol,int icom,spinor_dble *s,spinor_dble *r) +{ + int n; + complex_dble w,z; + spinor_dble *sm,*smb; + + for (n=0;nsm) + smb=sm; + + z.re=0.0; + z.im=0.0; + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&w.re,&z.re,2); + return z; + } + else + return w; +} + + +double norm_square_dble(int vol,int icom,spinor_dble *s) +{ + int n; + double x,y; + spinor_dble *sm,*smb; + + for (n=0;nsm) + smb=sm; + + x=0.0; + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&y,&x,2); + return x; + } + else + return y; +} + + +void mulc_spinor_add_dble(int vol,spinor_dble *s,spinor_dble *r, + complex_dble z) +{ + spinor_dble *sm; + + sm=s+vol; + + for (;snrot)&&(ifail==0)) + alloc_wrotate(n); + + if ((n>0)&&(ifail==0)) + { + for (ix=0;ix +#include +#include +#include "mpi.h" +#include "utils.h" +#include "linalg.h" +#include "global.h" + +static int nrot=0,ifail=0; +static complex *psi; + + +static void alloc_wrotate(int n) +{ + if (nrot>0) + afree(psi); + + psi=amalloc(n*sizeof(*psi),ALIGN); + + if (psi==NULL) + { + error_loc(1,1,"alloc_wrotate [valg.c]","Unable to allocate workspace"); + nrot=0; + ifail=1; + } + else + nrot=n; +} + + +complex vprod(int n,int icom,complex *v,complex *w) +{ + complex z,*vm; + complex_dble vd,wd; + + vd.re=0.0; + vd.im=0.0; + vm=v+n; + + for (;vnrot)&&(ifail==0)) + alloc_wrotate(nv); + + if ((nv>0)&&(ifail==0)) + { + for (i=0;i +#include +#include +#include "mpi.h" +#include "utils.h" +#include "linalg.h" +#include "global.h" + +#define MAX_LEVELS 8 +#define BLK_LENGTH 32 + +static int nrot=0,ifail=0; +static int cnt[MAX_LEVELS]; +static double smx[MAX_LEVELS],smy[MAX_LEVELS]; +static complex_dble *psi; + + +static void alloc_wrotate(int n) +{ + if (nrot>0) + afree(psi); + + psi=amalloc(n*sizeof(*psi),ALIGN); + + if (psi==NULL) + { + error_loc(1,1,"alloc_wrotate [valg_dble.c]", + "Unable to allocate workspace"); + nrot=0; + ifail=1; + } + else + nrot=n; +} + + +complex_dble vprod_dble(int n,int icom,complex_dble *v,complex_dble *w) +{ + int k; + complex_dble s,t; + complex_dble *vm,*vb; + + for (k=0;kvm) + vb=vm; + s.re=0.0; + s.im=0.0; + + for (;v=BLK_LENGTH)&&(kvm) + vb=vm; + s=0.0; + + for (;v=BLK_LENGTH)&&(knrot)&&(ifail==0)) + alloc_wrotate(nv); + + if ((nv>0)&&(ifail==0)) + { + for (i=0;i +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "sflds.h" +#include "linalg.h" +#include "linsolv.h" +#include "global.h" + +#define PRECISION_LIMIT ((double)(100.0f*FLT_EPSILON)) + +static float rsq,rsq_old,ai,bi; +static spinor *psx,*psr,*psp,*psap,*psw; +static spinor_dble *pdb,*pdx,*pdw,*pdv; + +#if (defined x64) +#include "sse2.h" + +static void update_g(int vol) +{ + float c; + spinor *r,*s,*sm; + + c=-ai; + + __asm__ __volatile__ ("movss %0, %%xmm6 \n\t" + "shufps $0x0, %%xmm6, %%xmm6 \n\t" + "movaps %%xmm6, %%xmm7 \n\t" + "movaps %%xmm6, %%xmm8" + : + : + "m" (c) + : + "xmm6", "xmm7", "xmm8"); + + r=psr; + s=psap; + sm=s+vol; + + for (;s1)) + { + iprms[0]=vol; + iprms[1]=nmx; + dprms[0]=res; + + MPI_Bcast(iprms,2,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error((iprms[0]!=vol)||(iprms[1]!=nmx)||(dprms[0]!=res),1, + "cgne [cgne.c]","Parameters are not global"); + + error_root((vol<=0)||(nmx<1)||(res<=DBL_EPSILON),1, + "cgne [cgne.c]","Improper choice of vol,nmx or res"); + } + else + { + if ((vol<=0)||(nmx<1)||(res<=DBL_EPSILON)) + { + error_loc(1,1,"cgne [cgne.c]", + "Improper choice of vol,nmx or res"); + (*status)=0; + return 1.0; + } + } + + cg_init(vol,icom,ws,wsd,eta,psi); + rn=sqrt((double)(rsq)); + tol=res*rn; + (*status)=0; + + xn=(double)(norm_square(vol,icom,psx)); + xn=sqrt(xn); + + while (rn>tol) + { +#ifdef CGNE_DBG + message("[cgne]: rn_old = %.2e\n",rn); +#endif + ncg=0; + + for (;;) + { + cg_step(vol,icom,Dop); + ncg+=1; + (*status)+=1; + + xn=(double)(norm_square(vol,icom,psx)); + xn=sqrt(xn); + rn=sqrt((double)(rsq)); +#ifdef CGNE_DBG + message("[cgne]: ncg = %d, xn = %.2e, rn = %.2e\n",(*status),xn,rn); +#endif + if ((rn<=tol)||(rn<=(PRECISION_LIMIT*xn))||(ncg>=100)|| + ((*status)>=nmx)) + break; + } + + add_s2sd(vol,psx,pdx); + xn=norm_square_dble(vol,icom,pdx); + xn=sqrt(xn); + cg_reset(vol,icom,Dop,Dop_dble); + rn=sqrt((double)(rsq)); + + if (((*status)>=nmx)&&(rn>tol)) + { + (*status)=-1; + break; + } + + if ((100.0*DBL_EPSILON*xn)>tol) + { + (*status)=-2; + break; + } + } + + return rn; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linsolv/fgcr.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linsolv/fgcr.c new file mode 100644 index 0000000000000000000000000000000000000000..3f50ce946a0d35191ea5c105f19d784530df45b6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linsolv/fgcr.c @@ -0,0 +1,300 @@ + +/******************************************************************************* +* +* File fgcr.c +* +* Copyright (C) 2005, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Generic flexible GCR solver program for the lattice Dirac equation. +* +* The externally accessible function is +* +* double fgcr(int vol,int icom, +* void (*Dop)(spinor_dble *s,spinor_dble *r), +* void (*Mop)(int k,spinor *rho,spinor *phi,spinor *chi), +* spinor **ws,spinor_dble **wsd,int nkv,int nmx,double res, +* spinor_dble *eta,spinor_dble *psi,int *status) +* Solution of the Dirac equation D*psi=eta for given source eta, using +* the preconditioned GCR algorithm. See the notes for the explanation +* of the parameters of the program. +* +* Notes: +* +* This program uses single-precision arithmetic to reduce the execution +* time, but obtains the solution with double-precision accuracy. +* +* The programs Dop() and Mop() for the operator D and the preconditioner M +* are assumed to have the following properties: +* +* void Dop(spinor_dble *s,spinor_dble *r) +* Application of the operator D to the Dirac field s and assignment of +* the result to r. On exit s may be changed but must satisfy D*s=r. +* +* void Mop(int k,spinor *rho,spinor *phi,spinor *chi) +* Approximate solution of the equation D*phi=rho in the k'th step of +* the GCR algorithm. On exit rho is unchanged and chi=D*phi. +* +* Mop() is not required to be a linear operator and may involve an iterative +* procedure with a dynamical stopping criterion, for example. The field phi +* merely defines the next search direction and can in principle be chosen +* arbitrarily. +* +* The other parameters of the program fgcr() are: +* +* vol Number of spinors in the Dirac fields. +* +* icom Indicates whether the equation to be solved is a local +* equation (icom=0) or a global one (icom=1). Scalar products +* are summed over all MPI processes if icom=1, while no +* communications are performed if icom=0. +* +* nkv Maximal number of Krylov vectors generated before the GCR +* algorithm is restarted. +* +* nmx Maximal total number of Krylov vectors that may be generated. +* +* res Desired maximal relative residue |eta-D*psi|/|eta| of the +* calculated solution. +* +* ws Array of at least 2*nkv+1 single-precision spinor fields +* (used as work space). +* +* wsd Array of at least 1 double-precision spinor field (used +* as work space). +* +* eta Source field (unchanged on exit). +* +* psi Calculated approximate solution of the Dirac equation +* D*psi=eta. +* +* status On exit, this parameter reports the total number of Krylov +* vectors that were generated, or a negative value if the +* program failed. +* +* Independently of whether the program succeeds in solving the Dirac equation +* to the desired accuracy, the program returns the norm of the residue of +* the field psi. +* +* Some debugging output is printed to stdout on process 0 if FGCR_DBG is +* defined at compilation time. +* +*******************************************************************************/ + +#define FGCR_C + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "sflds.h" +#include "linalg.h" +#include "linsolv.h" +#include "global.h" + +#define PRECISION_LIMIT ((double)(100.0f*FLT_EPSILON)) + +static int nkm=0; +static float *b; +static complex *a,*c; +static double rn; +static spinor **phi,**chi,*rho; +static spinor_dble *wrk; + + +static int alloc_arrays(int nkv) +{ + if (nkm>0) + { + afree(a); + afree(b); + } + + a=amalloc(nkv*(nkv+1)*sizeof(*a),ALIGN); + b=amalloc(nkv*sizeof(*b),ALIGN); + + if ((a==NULL)||(b==NULL)) + return 1; + + c=a+nkv*nkv; + nkm=nkv; + + return 0; +} + + +static void gcr_init(int vol,int icom,int nkv,spinor **ws,spinor_dble **wsd, + spinor_dble *eta,spinor_dble *psi) +{ + phi=ws; + rho=ws[nkv]; + chi=ws+nkv+1; + wrk=wsd[0]; + + set_sd2zero(vol,psi); + assign_sd2s(vol,eta,rho); + + rn=(double)(norm_square(vol,icom,rho)); + rn=sqrt(rn); +} + + +static void gcr_step(int vol,int icom,int k,int nkv, + void (*Mop)(int k,spinor *rho,spinor *phi,spinor *chi)) +{ + int l; + complex z; + + (*Mop)(k,rho,phi[k],chi[k]); + + for (l=0;l=0;l--) + { + z.re=c[l].re; + z.im=c[l].im; + + for (i=(l+1);i<=k;i++) + { + z.re-=(a[l*nkv+i].re*c[i].re-a[l*nkv+i].im*c[i].im); + z.im-=(a[l*nkv+i].re*c[i].im+a[l*nkv+i].im*c[i].re); + } + + r=1.0f/b[l]; + c[l].re=z.re*r; + c[l].im=z.im*r; + } + + set_s2zero(vol,rho); + + for (l=k;l>=0;l--) + mulc_spinor_add(vol,rho,phi[l],c[l]); + + add_s2sd(vol,rho,psi); + (*Dop)(psi,wrk); + diff_sd2s(vol,eta,wrk,rho); + + rn=(double)(norm_square(vol,icom,rho)); + rn=sqrt(rn); +} + + +double fgcr(int vol,int icom, + void (*Dop)(spinor_dble *s,spinor_dble *r), + void (*Mop)(int k,spinor *eta,spinor *psi,spinor *chi), + spinor **ws,spinor_dble **wsd,int nkv,int nmx,double res, + spinor_dble *eta,spinor_dble *psi,int *status) +{ + int ie,k,iprms[3]; + double rn_old,tol,dprms[1]; + + if ((icom==1)&&(NPROC>1)) + { + iprms[0]=vol; + iprms[1]=nkv; + iprms[2]=nmx; + dprms[0]=res; + + MPI_Bcast(iprms,3,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error((iprms[0]!=vol)||(iprms[1]!=nkv)||(iprms[2]!=nmx)|| + (dprms[0]!=res),1,"fgcr [fgcr.c]","Parameters are not global"); + + error_root((vol<=0)||(nkv<1)||(nmx<1)||(res<=DBL_EPSILON),1, + "fgcr [fgcr.c]","Improper choice of vol,nkv,nmx or res"); + + if (nkv>nkm) + { + ie=alloc_arrays(nkv); + error(ie,1,"fgcr [fgcr.c]","Unable to allocate auxiliary arrays"); + } + } + else + { + if ((vol<=0)||(nkv<1)||(nmx<1)||(res<=DBL_EPSILON)) + { + error_loc(1,1,"fgcr [fgcr.c]", + "Improper choice of vol,nkv,nmx or res"); + (*status)=0; + return 1.0; + } + + if (nkv>nkm) + { + ie=alloc_arrays(nkv); + + if (ie) + { + error_loc(1,1,"fgcr [fgcr.c]", + "Unable to allocate auxiliary arrays"); + (*status)=0; + return 1.0; + } + } + } + + gcr_init(vol,icom,nkv,ws,wsd,eta,psi); + tol=res*rn; + (*status)=0; + + while (rn>tol) + { +#ifdef FGCR_DBG + message("[fgcr]: rn_old = %.2e\n",rn); +#endif + rn_old=rn; + + for (k=0;;k++) + { + gcr_step(vol,icom,k,nkv,Mop); + (*status)+=1; +#ifdef FGCR_DBG + message("[fgcr]: k = %d, rn = %.2e\n",k,rn); +#endif + if ((rn<=tol)||(rn<(PRECISION_LIMIT*rn_old))|| + ((k+1)==nkv)||((*status)==nmx)) + break; + } + + update_psi(vol,icom,k,nkv,eta,psi,Dop); + + if (((*status)==nmx)&&(rn>tol)) + { + (*status)=-1; + return rn; + } + } + + return rn; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linsolv/fgcr4vd.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linsolv/fgcr4vd.c new file mode 100644 index 0000000000000000000000000000000000000000..fd5cf8dfc81eacbf0c0234d9ddcd6dd761270416 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linsolv/fgcr4vd.c @@ -0,0 +1,354 @@ + +/******************************************************************************* +* +* File fgcr4vd.c +* +* Copyright (C) 2007, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Generic flexible GCR solver program for the little Dirac equation. +* +* The externally accessible function is +* +* double fgcr4vd(int vol,int icom, +* void (*Dop)(complex_dble *v,complex_dble *w), +* void (*Mop)(int k,complex *rho,complex *phi,complex *chi), +* complex *wv[],complex_dble *wvd[],int nkv,int nmx,double res, +* complex_dble *eta,complex_dble *psi,int *status) +* Solution of the little equation D*psi=eta for given source eta, using +* the preconditioned GCR algorithm. See the notes for the explanation +* of the parameters of the program. +* +* Notes: +* +* This program uses single-precision arithmetic to reduce the execution +* time, but obtains the solution with double-precision accuracy. +* +* The programs Dop() and Mop() for the operator D and the preconditioner M +* are assumed to have the following properties: +* +* void Dop(complex_dble *v,complex_dble *w) +* Application of the operator D to the complex field v and assignment +* of the result to w. On exit v may be changed but must satisfy D*v=w. +* +* void Mop(int k,complex *rho,complex *phi,complex *chi) +* Approximate solution of the equation D*phi=rho in the k'th step of +* the GCR algorithm. On exit rho is unchanged and chi=D*phi. +* +* Mop() is not required to be a linear operator and may involve an iterative +* procedure with a dynamical stopping criterion, for example. The field phi +* merely defines the next search direction and can in principle be chosen +* arbitrarily. +* +* The other parameters of the program fgcr4vd() are: +* +* vol Number of complex components of the fields on which the +* operator D acts. +* +* icom Indicates whether the equation to be solved is a local +* equation (icom=0) or a global one (icom=1). Scalar products +* are summed over all MPI processes if icom=1, while no +* communications are performed if icom=0. +* +* nkv Maximal number of Krylov vectors generated before the GCR +* algorithm is restarted. +* +* nmx Maximal total number of Krylov vectors that may be generated. +* +* res Desired maximal relative residue |eta-D*psi|/|eta| of the +* calculated solution. +* +* wv Array of at least 2*nkv+1 single-precision complex fields +* (used as work space). +* +* wvd Array of at least 1 double-precision complex field (used +* as work space). +* +* eta Source field (unchanged on exit). +* +* psi Calculated approximate solution of the little equation +* D*psi=eta. +* +* status On exit, this parameter reports the total number of Krylov +* vectors that were generated or -1 if the algorithm did not +* converge. +* +* Independently of whether the program succeeds in solving the little equation +* to the desired accuracy, the program returns the norm of the residue of +* the field psi. +* +* Some debugging output is printed to stdout on process 0 if FGCR4VD_DBG is +* defined at compilation time. +* +*******************************************************************************/ + +#define FGCR4VD_C + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "vflds.h" +#include "linalg.h" +#include "linsolv.h" +#include "global.h" + +#define PRECISION_LIMIT ((double)(100.0f*FLT_EPSILON)) + +static int nkm=0; +static float *b; +static complex *a,*c; +static double rn; +static complex **phi,**chi,*rho; +static complex_dble *wrk,*cs1,*cs2; + + +static int alloc_arrays(int nkv) +{ + if (nkm>0) + { + afree(a); + afree(b); + afree(cs1); + } + + a=amalloc(nkv*(nkv+1)*sizeof(*a),ALIGN); + b=amalloc(nkv*sizeof(*b),ALIGN); + cs1=amalloc(2*(nkv+2)*sizeof(*cs1),ALIGN); + + if ((a==NULL)||(b==NULL)||(cs1==NULL)) + return 1; + + c=a+nkv*nkv; + cs2=cs1+nkv+2; + nkm=nkv; + + return 0; +} + + +static void gcr_init(int vol,int icom,int nkv,complex **wv,complex_dble **wvd, + complex_dble *eta,complex_dble *psi) +{ + phi=wv; + rho=wv[nkv]; + chi=wv+nkv+1; + wrk=wvd[0]; + + set_vd2zero(vol,psi); + assign_vd2v(vol,eta,rho); + + rn=(double)(vnorm_square(vol,icom,rho)); + rn=sqrt(rn); +} + + +static void sum_vprod(int icom,int n) +{ + int i; + + if ((icom==1)&&(NPROC>1)) + { + MPI_Reduce((double*)(cs1),(double*)(cs2),2*n,MPI_DOUBLE, + MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast((double*)(cs2),2*n,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + { + for (i=0;i=0;l--) + { + z.re=c[l].re; + z.im=c[l].im; + + for (i=(l+1);i<=k;i++) + { + z.re-=(a[l*nkv+i].re*c[i].re-a[l*nkv+i].im*c[i].im); + z.im-=(a[l*nkv+i].re*c[i].im+a[l*nkv+i].im*c[i].re); + } + + r=1.0f/b[l]; + c[l].re=z.re*r; + c[l].im=z.im*r; + } + + set_v2zero(vol,rho); + + for (l=k;l>=0;l--) + mulc_vadd(vol,rho,phi[l],c[l]); + + add_v2vd(vol,rho,psi); + (*Dop)(psi,wrk); + diff_vd2v(vol,eta,wrk,rho); + + rn=(double)(vnorm_square(vol,icom,rho)); + rn=sqrt(rn); +} + + +double fgcr4vd(int vol,int icom, + void (*Dop)(complex_dble *v,complex_dble *w), + void (*Mop)(int k,complex *eta,complex *psi,complex *chi), + complex **wv,complex_dble **wvd,int nkv,int nmx,double res, + complex_dble *eta,complex_dble *psi,int *status) +{ + int ie,k,iprms[3]; + double rn_old,tol,dprms[1]; + + if ((icom==1)&&(NPROC>1)) + { + iprms[0]=vol; + iprms[1]=nkv; + iprms[2]=nmx; + dprms[0]=res; + + MPI_Bcast(iprms,3,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error((iprms[0]!=vol)||(iprms[1]!=nkv)||(iprms[2]!=nmx)|| + (dprms[0]!=res),1,"fgcr4vd [fgcr4vd.c]", + "Parameters are not global"); + + error_root((vol<=0)||(nkv<1)||(nmx<1)||(res<=DBL_EPSILON),1, + "fgcr4vd [fgcr4vd.c]", + "Improper choice of vol,nkv,nmx or res"); + + if (nkv>nkm) + { + ie=alloc_arrays(nkv); + error(ie,1,"fgcr4vd [fgcr4vd.c]", + "Unable to allocate auxiliary arrays"); + } + } + else + { + if ((vol<=0)||(nkv<1)||(nmx<1)||(res<=DBL_EPSILON)) + { + error_loc(1,1,"fgcr4vd [fgcrvvd.c]", + "Improper choice of vol,nkv,nmx or res"); + (*status)=0; + return 1.0; + } + + if (nkv>nkm) + { + ie=alloc_arrays(nkv); + + if (ie) + { + error_loc(1,1,"fgcr4vd [fgcr4vd.c]", + "Unable to allocate auxiliary arrays"); + (*status)=0; + return 1.0; + } + } + } + + gcr_init(vol,icom,nkv,wv,wvd,eta,psi); + tol=res*rn; + (*status)=0; + + while (rn>tol) + { +#ifdef FGCR4VD_DBG + message("[fgcr4vd]: rn_old = %.2e\n",rn); +#endif + rn_old=rn; + + for (k=0;;k++) + { + gcr_step(vol,icom,k,nkv,Mop); + (*status)+=1; +#ifdef FGCR4VD_DBG + message("[fgcr4vd]: k = %d, rn = %.2e\n",k,rn); +#endif + if ((rn<=tol)||(rn<(PRECISION_LIMIT*rn_old))|| + ((k+1)==nkv)||((*status)==nmx)) + break; + } + + update_psi(vol,icom,k,nkv,eta,psi,Dop); + + if (((*status)==nmx)&&(rn>tol)) + { + (*status)=-1; + return rn; + } + } + + return rn; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linsolv/mscg.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linsolv/mscg.c new file mode 100644 index 0000000000000000000000000000000000000000..e0060df7c57456d9a285778f9dd76f660f32de1d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linsolv/mscg.c @@ -0,0 +1,526 @@ + +/******************************************************************************* +* +* File mscg.c +* +* Copyright (C) 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Generic multi-shift CG solver program for the lattice Dirac equation +* +* The externally accessible function is +* +* void mscg(int vol,int icom,int nmu,double *mu, +* void (*Dop_dble)(double mu,spinor_dble *s,spinor_dble *r), +* spinor_dble **wsd,int nmx,double *res, +* spinor_dble *eta,spinor_dble **psi,int *status) +* Solution of the Dirac equation (D^dag*D+mu^2)*psi=eta for a given +* source eta and one or more values of mu using the multi-shift CG +* algorithm. See the notes for the explanation of the parameters of +* the program. +* +* Notes: +* +* The algorithm implemented in this module is described in the notes +* "Multi-shift conjugate gradient algorithm" (file doc/mscg.pdf). +* +* The program Dop_dble() for the Dirac operator is assumed to have the +* following properties: +* +* void Dop_dble(double mu,spinor_dble *s,spinor_dble *r) +* Application of an operator Op or its hermitian conjugate Op^dag +* to the double-precision Dirac field s and assignment of the result +* to r (where r is different from s). The operator must be such that +* the identity Op^dag*Op=D^dag*D+mu^2 holds. Op and Op^dag are applied +* alternatingly, i.e. the first call of the program applies Op, the +* next call Op^dag, then Op again and so on. In all cases, the source +* field s remains unchanged. +* +* The other parameters of the program mscg() are: +* +* vol Number of spinors in the Dirac fields. +* +* icom Indicates whether the equation to be solved is a local +* equation (icom=0) or a global one (icom=1). Scalar products +* are summed over all MPI processes if icom=1, while no +* communications are performed if icom=0. +* +* nmu Number of shifts mu. +* +* mu Array of the shifts mu (nmu elements). +* +* nmx Maximal number of CG iterations that may be applied. +* +* res Array of the desired maximal relative residues of the +* calculated solutions (nmu elements). +* +* wsd Array of at least 3+nmu (5 if nmu=1) double-precision spinor +* fields (used as work space). +* +* eta Source field (unchanged on exit). +* +* psi Array of the calculated approximate solutions of the Dirac +* equations (D^dag*D+mu^2)*psi=eta (nmu elements). +* +* status On exit, this parameter reports the number of CG iterations +* that were required, or a negative value if the program failed. +* +* The spinor fields must have at least vol elements and must be such that +* the program Dop_dble() acts correctly on them. Some debugging output is +* printed to stdout on process 0 if the macro MSCG_DBG is defined. +* +*******************************************************************************/ + +#define MSCG_C + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "sflds.h" +#include "linalg.h" +#include "linsolv.h" +#include "global.h" + +typedef struct +{ + int stop; + double s,tol; + double ah,bh,gh,rh; + spinor_dble *xh,*ph; +} cgsh_t; + +typedef struct +{ + int k,stop; + double mu,tol; + double a,b; + double rn0,rn,rnsq; + spinor_dble *x,*r,*p,*ap,*w; +} cgs_t; + +static int ns=0; +static double *dprms; +static cgs_t cgs; +static cgsh_t *cgsh; + + +static int alloc_cgs(int nmu,double *mu,double *res,spinor_dble **wsd, + spinor_dble **psi) +{ + int k,l,k0; + + if (nmu>ns) + { + if (ns>0) + free(dprms); + if (ns>1) + free(cgsh); + + dprms=malloc(2*nmu*sizeof(*dprms)); + if (dprms==NULL) + return 1; + + if (nmu>1) + { + cgsh=malloc((nmu-1)*sizeof(*cgsh)); + if (cgsh==NULL) + return 1; + } + + ns=nmu; + } + + k0=0; + + for (k=1;k=nmx) + return 1; + + x=wsd[2]; + p=wsd[3]; + ap=wsd[4]; + + set_sd2zero(vol,x); + assign_sd2sd(vol,r,p); + + while ((rn>tol)&&((*ncg)1)) + { + iprms[0]=vol; + iprms[1]=nmu; + iprms[2]=nmx; + + MPI_Bcast(iprms,3,MPI_INT,0,MPI_COMM_WORLD); + error((iprms[0]!=vol)||(iprms[1]!=nmu)||(iprms[2]!=nmx),1, + "mscg [mscg.c]","Integer parameters are not global"); + error_root((vol<1)||(nmu<1)||(nmx<1),1,"mscg [mscg.c]", + "Improper choice of vol,nmu or nmx"); + + ie=alloc_cgs(nmu,mu,res,wsd,psi); + error(ie!=0,1,"mscg [mscg.c]","Unable to allocate auxiliary arrays"); + + for (k=0;k +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "vflds.h" +#include "linalg.h" +#include "dfl.h" +#include "little.h" +#include "global.h" + +static int Ns=0,nb,nbh; +static int nbbh,(*inn)[8]; +static complex *vs; + + +static void alloc_vs(void) +{ + dfl_parms_t dfl; + dfl_grid_t grd; + + dfl=dfl_parms(); + grd=dfl_geometry(); + + Ns=dfl.Ns; + nb=grd.nb; + nbh=nb/2; + nbbh=grd.nbb/2; + inn=grd.inn; + + vs=amalloc(Ns*sizeof(*vs),ALIGN); + + error(vs==NULL,1,"alloc_vs [Aw.c]", + "Unable to allocate auxiliary array"); +} + + +static void apply_Aoe(int *nn,complex **A,complex *v) +{ + int ifc; + + cmat_vec(Ns,*A,v+nn[0]*Ns,vs); + A+=1; + + for (ifc=1;ifc<8;ifc++) + { + cmat_vec_assign(Ns,*A,v+nn[ifc]*Ns,vs); + A+=1; + } +} + + +static void apply_Aeo(int *nn,complex **A,complex *v) +{ + int ifc; + + for (ifc=0;ifc<8;ifc++) + { + cmat_vec_assign(Ns,*A,vs,v+nn[ifc]*Ns); + A+=1; + } +} + + +static void apply_Aee(complex **A,complex *v,complex *w) +{ + complex **Am; + + Am=A+nbh; + + for (;A1) + { + set_v2zero(nbbh*Ns,w+nb*Ns); + cpv_int_bnd(v); + } + + Aoe=Aw.Aoe; + Aeo=Aw.Aeo; + rv=v+nbh*Ns; + rw=w+nbh*Ns; + + nn=inn+nbh; + nm=inn+nb; + + for (;nn1) + cpv_ext_bnd(w); +} + + +void Aweeinv(complex *v,complex *w) +{ + Aw_t Aw; + + if (Ns==0) + alloc_vs(); + + Aw=Awophat(); + apply_Aee(Aw.Aee,v,w); +} + + +void Awooinv(complex *v,complex *w) +{ + Aw_t Aw; + + if (Ns==0) + alloc_vs(); + + Aw=Awophat(); + apply_Aoo(Aw.Aoo,v,w); +} + + +void Awoe(complex *v,complex *w) +{ + int (*nn)[8],(*nm)[8]; + complex *rw,*rs,*rm; + complex **Aoe; + Aw_t Aw; + + if (Ns==0) + alloc_vs(); + + if (NPROC>1) + cpv_int_bnd(v); + + Aw=Awop(); + Aoe=Aw.Aoe; + rw=w+nbh*Ns; + + nn=inn+nbh; + nm=inn+nb; + + for (;nn1) + set_v2zero(nbbh*Ns,w+nb*Ns); + + Aw=Awop(); + Aeo=Aw.Aeo; + rv=v+nbh*Ns; + + nn=inn+nbh; + nm=inn+nb; + + for (;nn1) + cpv_ext_bnd(w); +} + + +void Awhat(complex *v,complex *w) +{ + int (*nn)[8],(*nm)[8]; + complex *rs,*rm; + complex **Aeo,**Aoe; + Aw_t Aw; + + if (Ns==0) + alloc_vs(); + + assign_v2v(nbh*Ns,v,w); + + if (NPROC>1) + { + set_v2zero(nbbh*Ns,w+nb*Ns); + cpv_int_bnd(v); + } + + Aw=Awophat(); + Aoe=Aw.Aoe; + Aeo=Aw.Aeo; + + nn=inn+nbh; + nm=inn+nb; + + for (;nn1) + cpv_ext_bnd(w); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/Aw_com.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/Aw_com.c new file mode 100644 index 0000000000000000000000000000000000000000..76b5d480c8408a56743b9bc898d1003e2799d8bb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/Aw_com.c @@ -0,0 +1,799 @@ + +/******************************************************************************* +* +* File Aw_com.c +* +* Copyright (C) 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Communication functions needed for the computation of the little Dirac +* operator. +* +* b2b_flds_t *b2b_flds(int n,int mu) +* Extracts the spinor fields on the interior boundaries of the n'th +* block of the DFL_BLOCKS grid and its neighbouring block in direction +* mu. The spinors on the odd sites are multiplied by the link variables +* in direction mu and -mu respectively. If the two blocks touch the +* boundary of the local lattice, the fields extracted from the even +* sites are copied to the neighbouring process. The program returns a +* structure containing the extracted field arrays (see README.Aw_com +* for detailed explanations). +* +* void cpAoe_ext_bnd(void) +* Copies the hopping terms Aoe and Aeo of the double-precision little +* Dirac operator on the odd exterior boundary points of the local block +* lattice to the neighbouring MPI processes and *adds* them to the hop- +* ping terms on the matching blocks on the target lattices. +* +* void cpAee_int_bnd(void) +* Copies the even-even terms Aee of the double-precision little Dirac +* operator on the (even) interior boundary points of the local block +* lattice to the neighbouring MPI processes. +* +* Notes: +* +* The program b2b_flds() writes the extracted spinor fields to internally +* allocated field arrays. These are reused when the program is called +* the next time. The data in the field arrays returned by b2b_flds() are +* therefore preserved only up to the next call of the program. +* +*******************************************************************************/ + +#define AW_COM_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "uflds.h" +#include "sflds.h" +#include "vflds.h" +#include "dfl.h" +#include "little.h" +#include "global.h" + +typedef struct +{ + int *iud[2]; + int *ise[2]; + int *iso[2]; + su3_dble *ud[2]; + spinor_dble **sd[2]; + spinor_dble **snd_buf[2]; + b2b_flds_t b2b; +} bsd_t; + +static int init_bsd=0,init_Aoe=0,init_Aee=0; +static int np,nmu[8]; +static int Ns=0,nb,nbb,nbh,nbbh; +static int nbbe[8],nbbo[8],obbe[8],obbo[8]; +static int (*inn)[8],*idx,*ipp,*mp; +static int nsnd,sfc[8]; + +static complex_dble **snd_buf_Aee[8]; +static complex_dble **rcv_buf_Aoe[8],**rcv_buf_Aeo[8]; +static bsd_t (*bsd)[4]; + +static MPI_Request snd_req_bsd[8],rcv_req_bsd[8]; +static MPI_Request snd_req_Aee[8],rcv_req_Aee[8]; +static MPI_Request snd_req_Aoe[8],rcv_req_Aoe[8]; +static MPI_Request snd_req_Aeo[8],rcv_req_Aeo[8]; + + +static void set_constants(void) +{ + int ifc; + dfl_parms_t dfl; + dfl_grid_t grd; + + dfl=dfl_parms(); + grd=dfl_geometry(); + + Ns=dfl.Ns; + nb=grd.nb; + nbb=grd.nbb; + nbh=nb/2; + nbbh=nbb/2; + + for (ifc=0;ifc<8;ifc++) + { + nbbe[ifc]=grd.nbbe[ifc]; + nbbo[ifc]=grd.nbbo[ifc]; + obbe[ifc]=grd.obbe[ifc]; + obbo[ifc]=grd.obbo[ifc]; + } + + inn=grd.inn; + idx=grd.idx; + ipp=grd.ipp; + mp=grd.map; + + np=(cpr[0]+cpr[1]+cpr[2]+cpr[3])&0x1; + nsnd=0; + + for (ifc=0;ifc<8;ifc++) + { + nmu[ifc]=cpr[ifc/2]&0x1; + + if (nbbe[ifc]+nbbo[ifc]) + { + sfc[nsnd]=ifc; + nsnd+=1; + } + } +} + + +static int fnd_nn(int n,int ifc) +{ + n=idx[n]; + n=inn[n][ifc]; + + if (n>=nb) + n=mp[n-nb]; + + return idx[n]; +} + + +static void set_snd_req_bsd(void) +{ + int ifc,vol,nbf; + int tag,saddr,raddr; + bsd_t *brd; + b2b_flds_t *b2b; + + for (ifc=0;ifc<8;ifc++) + { + brd=bsd[0]+(ifc/2); + b2b=&(*brd).b2b; + vol=(*b2b).vol; + + nbf=24*Ns*vol; + saddr=npr[ifc]; + raddr=npr[ifc^0x1]; + tag=mpi_permanent_tag(); + + MPI_Send_init((*brd).snd_buf[(ifc&0x1)^0x1][0],nbf, + MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD,&snd_req_bsd[ifc]); + MPI_Recv_init((*b2b).sde[(ifc&0x1)^0x1][0],nbf, + MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&rcv_req_bsd[ifc]); + } +} + + +static void alloc_bsd(void) +{ + int nbs,isw,vbb,vbm; + int n,m,mu,ifc,vol,*iud; + int ix,iy,k; + su3_dble *ud; + spinor_dble **psd,*sd; + block_t *b; + bndry_t *bb; + bsd_t *brd; + + if (Ns==0) + set_constants(); + + b=blk_list(DFL_BLOCKS,&nbs,&isw); + error(nbs==0,1,"alloc_bsd [Aw_com.c]", + "DFL_BLOCKS grid is not allocated"); + + bb=(*b).bb; + vbb=0; + vbm=0; + + for (mu=0;mu<4;mu++) + { + vol=bb[2*mu].vol; + vbb+=vol; + + if (vol>vbm) + vbm=vol; + } + + bsd=malloc(nb*sizeof(*bsd)); + iud=malloc(nb*vbb*sizeof(*iud)); + ud=amalloc(vbm*sizeof(*ud),ALIGN); + psd=malloc(24*Ns*sizeof(*psd)); + sd=amalloc(3*Ns*vbm*sizeof(*sd),ALIGN); + + error((bsd==NULL)||(iud==NULL)||(ud==NULL)||(psd==NULL)||(sd==NULL),1, + "alloc_bsd [Aw_com.c]","Unable to allocate buffers"); + + set_sd2zero(3*Ns*vbm,sd); + + for (n=0;n0) + send_bufs_Aee(sfc[m],eo); + + ifc=sfc[n]; + io=ifc^nmu[ifc]; + + get_mat(nbbo[io],ipp+obbo[io],Aw.Aee,snd_buf_Aee[io]); + + if (n>0) + { + wait_bufs_Aee(sfc[m],eo); + m+=eo; + eo^=0x1; + } + } + + while (m +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "vflds.h" +#include "linalg.h" +#include "dfl.h" +#include "little.h" +#include "global.h" + +static int Ns=0,nb,nbh; +static int nbbh,(*inn)[8]; +static complex_dble *vs; + + +static void alloc_vs(void) +{ + dfl_parms_t dfl; + dfl_grid_t grd; + + dfl=dfl_parms(); + grd=dfl_geometry(); + + Ns=dfl.Ns; + nb=grd.nb; + nbh=nb/2; + nbbh=grd.nbb/2; + inn=grd.inn; + + vs=amalloc(Ns*sizeof(*vs),ALIGN); + + error(vs==NULL,1,"alloc_vs [Aw_dble.c]", + "Unable to allocate auxiliary array"); +} + + +static void apply_Aoe(int *nn,complex_dble **A,complex_dble *v) +{ + int ifc; + + cmat_vec_dble(Ns,*A,v+nn[0]*Ns,vs); + A+=1; + + for (ifc=1;ifc<8;ifc++) + { + cmat_vec_assign_dble(Ns,*A,v+nn[ifc]*Ns,vs); + A+=1; + } +} + + +static void apply_Aeo(int *nn,complex_dble **A,complex_dble *v) +{ + int ifc; + + for (ifc=0;ifc<8;ifc++) + { + cmat_vec_assign_dble(Ns,*A,vs,v+nn[ifc]*Ns); + A+=1; + } +} + + +static void apply_Aee(complex_dble **A,complex_dble *v,complex_dble *w) +{ + complex_dble **Am; + + Am=A+nbh; + + for (;A1) + { + set_vd2zero(nbbh*Ns,w+nb*Ns); + cpvd_int_bnd(v); + } + + Aoe=Aw.Aoe; + Aeo=Aw.Aeo; + rv=v+nbh*Ns; + rw=w+nbh*Ns; + + nn=inn+nbh; + nm=inn+nb; + + for (;nn1) + cpvd_ext_bnd(w); +} + + +void Aweeinv_dble(complex_dble *v,complex_dble *w) +{ + Aw_dble_t Aw; + + if (Ns==0) + alloc_vs(); + + Aw=Awophat_dble(); + apply_Aee(Aw.Aee,v,w); +} + + +void Awooinv_dble(complex_dble *v,complex_dble *w) +{ + Aw_dble_t Aw; + + if (Ns==0) + alloc_vs(); + + Aw=Awophat_dble(); + apply_Aoo(Aw.Aoo,v,w); +} + + +void Awoe_dble(complex_dble *v,complex_dble *w) +{ + int (*nn)[8],(*nm)[8]; + complex_dble *rw,*rs,*rm; + complex_dble **Aoe; + Aw_dble_t Aw; + + if (Ns==0) + alloc_vs(); + + if (NPROC>1) + cpvd_int_bnd(v); + + Aw=Awop_dble(); + Aoe=Aw.Aoe; + rw=w+nbh*Ns; + + nn=inn+nbh; + nm=inn+nb; + + for (;nn1) + set_vd2zero(nbbh*Ns,w+nb*Ns); + + Aw=Awop_dble(); + Aeo=Aw.Aeo; + rv=v+nbh*Ns; + + nn=inn+nbh; + nm=inn+nb; + + for (;nn1) + cpvd_ext_bnd(w); +} + + +void Awhat_dble(complex_dble *v,complex_dble *w) +{ + int (*nn)[8],(*nm)[8]; + complex_dble *rs,*rm; + complex_dble **Aeo,**Aoe; + Aw_dble_t Aw; + + if (Ns==0) + alloc_vs(); + + assign_vd2vd(nbh*Ns,v,w); + + if (NPROC>1) + { + set_vd2zero(nbbh*Ns,w+nb*Ns); + cpvd_int_bnd(v); + } + + Aw=Awophat_dble(); + Aoe=Aw.Aoe; + Aeo=Aw.Aeo; + + nn=inn+nbh; + nm=inn+nb; + + for (;nn1) + cpvd_ext_bnd(w); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/Aw_gen.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/Aw_gen.c new file mode 100644 index 0000000000000000000000000000000000000000..f1a9c7a99bb26c99684b1ca2617a9c2a14c1bb36 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/Aw_gen.c @@ -0,0 +1,860 @@ + +/******************************************************************************* +* +* File Aw_gen.c +* +* Copyright (C) 2007, 2008, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Generic programs needed for the computation of the little Dirac operator +* +* The externally accessible functions are +* +* void gather_ud(int vol,int *imb,su3_dble *ud,su3_dble *vd) +* Assigns the 3x3 matrices ud[imb[i]] to vd[i] (i=0,..,vol-1). +* +* void gather_sd(int vol,int *imb,spinor_dble *sd,spinor_dble *rd) +* Assigns the spinors sd[imb[i]] to rd[i] (i=0,..,vol-1). +* +* void apply_u2sd(int vol,int *imb,su3_dble *ud,spinor_dble *sd, +* spinor_dble *rd) +* Multiplies the spinors sd[imb[i]] by the 3x3 matrices ud[i] and +* assigns the result to rd[i] (i=0,..,vol-1). +* +* void apply_udag2sd(int vol,int *imb,su3_dble *ud,spinor_dble *sd, +* spinor_dble *rd) +* Multiplies the spinors sd[imb[i]] by the adjoint of the 3x3 matrices +* ud[i] and assigns the result to rd[i] (i=0,..,vol-1). +* +* The following is an array of functions indexed by the direction mu=0,..,3: +* +* void (*spinor_prod_gamma[])(int vol,spinor_dble *sd,spinor_dble *rd, +* complex_dble *sp) +* Computes the scalar products (sd,rd) and (sd,gamma_mu*rd), where +* gamma_mu denotes the Dirac matrix with index mu and the spinor +* fields are assumed to have vol elements. On exit the calculated +* products are assigned to sp[0] and sp[1], respectively. +* +* Notes: +* +* The representation of the Dirac matrices is specified in the notes +* "Implementation of the lattice Dirac operator" (file doc/dirac.pdf). +* The input and output fields may not overlap in the case of the programs +* gather_ud(), gather_sd(), apply_u2sd() and apply_udag2sd(). +* +* All these programs can be called locally. If SSE inline-assembly is used +* (i.e. if x64 is set), it is taken for granted that the field arrays are +* aligned to 16 byte boundaries. +* +*******************************************************************************/ + +#define AW_GEN_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "little.h" + +#define MAX_LEVELS 8 +#define BLK_LENGTH 8 + +static int cnt[MAX_LEVELS]; +static complex_dble sm0[MAX_LEVELS] ALIGNED16; +static complex_dble sm1[MAX_LEVELS] ALIGNED16; + + +static void init_sm(void) +{ + int n; + + for (n=0;n=BLK_LENGTH)&&(nrt) + rm=rt; + + _start_sm(); + + for (;rdrt) + rm=rt; + + _start_sm(); + + for (;rdrt) + rm=rt; + + _start_sm(); + + for (;rdrt) + rm=rt; + + _start_sm(); + + for (;rdrt) + rm=rt; + + z0.re=0.0; + z0.im=0.0; + z1.re=0.0; + z1.im=0.0; + + for (;rdrt) + rm=rt; + + z0.re=0.0; + z0.im=0.0; + z1.re=0.0; + z1.im=0.0; + + for (;rdrt) + rm=rt; + + z0.re=0.0; + z0.im=0.0; + z1.re=0.0; + z1.im=0.0; + + for (;rdrt) + rm=rt; + + z0.re=0.0; + z0.im=0.0; + z1.re=0.0; + z1.im=0.0; + + for (;rd +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "vflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "block.h" +#include "dfl.h" +#include "little.h" +#include "global.h" + +#define MAX_FROBENIUS 1.0e6 +#define MAX_UPDATE 128 + +static int Ns=0,nb,nbh,nbbh; +static int *idx,(*inn)[8]; +static int old_eo[2],nupd=0; +static double old_m0[2],old_mu[2]; +static Aw_dble_t Awd={0},Awdhat={0}; +static Aw_t Aws={0},Awshat={0}; + + +static void set_constants(void) +{ + dfl_parms_t dfl; + dfl_grid_t grd; + + dfl=dfl_parms(); + grd=dfl_geometry(); + + Ns=dfl.Ns; + nb=grd.nb; + nbh=nb/2; + nbbh=grd.nbb/2; + idx=grd.idx; + inn=grd.inn; +} + + +static void alloc_Awd(Aw_dble_t *Aw) +{ + int n,k,nmat,nbee,nboe; + complex_dble **ww,*w; + + if (Ns==0) + set_constants(); + + nmat=Ns*Ns; + nbee=0; + nboe=0; + if (Aw==(&Awd)) + nboe=nbbh; + if (Aw==(&Awdhat)) + nbee=nbbh; + n=18*nbh+nbee+2*nboe; + ww=malloc(n*sizeof(*ww)); + w=amalloc(n*nmat*sizeof(*w),ALIGN); + error((ww==NULL)||(w==NULL),1,"alloc_Awd [Aw_ops.c]", + "Unable to allocate matrix arrays"); + + for (k=0;k1) + { + dprms[0]=mu; + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + error(dprms[0]!=mu,1, + "set_Aw [Aw_ops.c]","Parameters are not global"); + } + + if (Awd.Ns==0) + alloc_Awd(&Awd); + + sw=sw_parms(); + m0=sw.m0; + tm=tm_parms(); + eo=tm.eoflg; + + if (query_flags(AW_UP2DATE)==1) + { + if ((m0!=old_m0[0])||(mu!=old_mu[0])||(eo!=old_eo[0])) + update_Awdiag(m0,mu,eo); + return; + } + + sw_term(NO_PTS); + b=blk_list(DFL_BLOCKS,&nbs,&isw); + + for (n=0;n=nbh) + { + z=Awd.Aeo[8*(msw-nbh)+(ifc^0x1)]; + w=Awd.Aoe[8*(msw-nbh)+(ifc^0x1)]; + } + else if (ibn) + { + mbd=inn[msw][ifc^0x1]-nb-nbbh; + z=Awd.Aoe[8*nbh+mbd]; + w=Awd.Aeo[8*nbh+mbd]; + } + else + { + z=Awd.Aoe[8*(nsw-nbh)+ifc]; + w=Awd.Aeo[8*(nsw-nbh)+ifc]; + } + + for (k=0;k=nbh) + { + z=Awd.Aoe[8*(nsw-nbh)+ifc]; + w=Awd.Aeo[8*(nsw-nbh)+ifc]; + } + else if (ibn) + { + nbd=inn[nsw][ifc]-nb-nbbh; + z=Awd.Aeo[8*nbh+nbd]; + w=Awd.Aoe[8*nbh+nbd]; + } + else + { + z=Awd.Aeo[8*(msw-nbh)+(ifc^0x1)]; + w=Awd.Aoe[8*(msw-nbh)+(ifc^0x1)]; + } + + for (k=0;kMAX_FROBENIUS) + ifail=1; + } + + cpAee_int_bnd(); + + for (n=0;n=nb) + m-=nbh; + + cmat_mul_dble(Ns,Awdhat.Aee[m],Awd.Aeo[8*n+ifc], + Awdhat.Aeo[8*n+ifc]); + } + } + + for (n=0;nMAX_FROBENIUS) + ifail=1; + + for (ifc=0;ifc<8;ifc++) + cmat_mul_dble(Ns,Awdhat.Aoo[n],Awd.Aoe[8*n+ifc],Awdhat.Aoe[8*n+ifc]); + } + + if (Awshat.Ns==0) + alloc_Aws(&Awshat); + assign_Awd2Aw(&Awdhat,&Awshat); + set_flags(COMPUTED_AWHAT); + + old_m0[1]=m0; + old_mu[1]=mu; + old_eo[1]=eo; + ifail|=set_ltl_modes(); + MPI_Allreduce(&ifail,&n,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD); + + return n; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/README new file mode 100644 index 0000000000000000000000000000000000000000..9bfe00eeefa082961ce6cc69e4258a5d30f2c1f5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/README @@ -0,0 +1,179 @@ + +******************************************************************************** + + Little Dirac Operator + +******************************************************************************** + + +Files +----- + +Aw_gen.c Generic programs needed for the computation of the + little Dirac operator + +Aw_com.c Communication program needed in the computation of + the little Dirac operator + +Aw_ops.c Computation of the little Dirac operator + +Aw.c Application of the single_precision little Wilson-Dirac + operator Aw + +Aw_dble.c Application of the double-precision little Wilson-Dirac + operator Aw + +ltl_modes.c Computation of the little modes + + +Include file +------------ + +The file little.h defines the prototypes for all externally accessible +functions that are defined in the *.c files listed above. + + +List of functions +----------------- + +void gather_ud(int vol,int *imb,su3_dble *ud,su3_dble *vd) + Assigns the 3x3 matrices ud[imb[i]] to vd[i] (i=0,..,vol-1). + +void gather_sd(int vol,int *imb,spinor_dble *sd,spinor_dble *rd) + Assigns the spinors sd[imb[i]] to rd[i] (i=0,..,vol-1). + +void apply_u2sd(int vol,int *imb,su3_dble *ud,spinor_dble *sd, + spinor_dble *rd) + Multiplies the spinors sd[imb[i]] by the 3x3 matrices ud[i] and + assigns the result to rd[i] (i=0,..,vol-1). + +void apply_udag2sd(int vol,int *imb,su3_dble *ud,spinor_dble *sd, + spinor_dble *rd) + Multiplies the spinors sd[imb[i]] by the adjoint of the 3x3 matrices + ud[i] and assigns the result to rd[i] (i=0,..,vol-1). + +The following is an array of functions indexed by the direction mu=0,..,3: + +void (*spinor_prod_gamma[])(int vol,spinor_dble *sd,spinor_dble *rd, + complex_dble *sp) + Computes the scalar products (sd,rd) and (sd,gamma_mu*rd), where + gamma_mu denotes the Dirac matrix with index mu and the spinor + fields are assumed to have vol elements. On exit the calculated + products are assigned to sp[0] and sp[1], respectively. + +b2b_flds_t *b2b_flds(int n,int mu) + Extracts the spinor fields on the interior boundaries of the n'th + block of the DFL_BLOCKS grid and its neighbouring block in direction + mu. The spinors on the odd sites are multiplied by the link variables + in direction mu and -mu respectively. If the two blocks touch the + boundary of the local lattice, the fields extracted from the even + sites are copied to the neighbouring process. The program returns a + structure containing the extracted field arrays (see README.Aw_com + for detailed explanations). + +void cpAoe_ext_bnd(void) + Copies the hopping terms Aoe and Aeo of the double-precision little + Dirac operator on the odd exterior boundary points of the local block + lattice to the neighbouring processes and *adds* them to the hopping + terms on the matching blocks on the target lattices. + +void cpAee_int_bnd(void) + Copies the even-even terms Aee of the double-precision little Dirac + operator on the (even) interior boundary points of the local block + lattice to the neighbouring processes. + +Aw_t Awop(void) + Returns a structure containing the matrices that describe the + single-precision little Dirac operator. + +Aw_t Awophat(void) + Returns a structure containing the matrices that describe the + single-precision even-odd preconditioned little Dirac operator. + +Aw_dble_t Awop_dble(void) + Returns a structure containing the matrices that describe the + double-precision little Dirac operator. + +Aw_dble_t Awophat_dble(void) + Returns a structure containing the matrices that describe the + double-precision even-odd preconditioned little Dirac operator. + +void set_Aw(double mu) + Computes the single- and the double-precision little Dirac operator. + The SW term is updated if needed and the twisted mass is set to mu. + If the twisted-mass flag is set, the twisted-mass term is switched + on the odd sites of the lattice. + +int set_Awhat(double mu) + Computes the single- and the double-precision even-odd preconditioned + little Dirac operator. The program calls set_Aw(mu) and thus updates + the operator w/o even-odd preconditioning too. The little modes are + updated as well (see ltl_modes.c). On exit the program returns 0 if + all matrix inversions were safe and 1 if not. + +void Aw(complex *v,complex *w) + Applies the little Dirac operator to the field v and assigns the + result to the field w. + +void Aweeinv(complex *v,complex *w) + Applies the inverse of the even-even part of the little Dirac operator + to the field v and assigns the result to the field w on the even blocks. + On the odd blocks, w is unchanged. + +void Awooinv(complex *v,complex *w) + Applies the inverse of the odd-odd part of the little Dirac operator + to the field v and assigns the result to the field w on the odd blocks. + On the even blocks, w is unchanged. + +void Awoe(complex *v,complex *w) + Applies the odd-even part of the little Dirac operator to the field v + and assigns the result to the field w on the odd blocks. On the even + blocks, w is unchanged. + +void Aweo(complex *v,complex *w) + Applies the even-odd part of the little Dirac operator to the field v + and *subtracts* the result from the field w on the even blocks. On the + odd blocks, w is unchanged. + +void Awhat(complex *v,complex *w) + Applies the even-odd preconditioned little Dirac operator to the field + v and assigns the result to the field w on the even blocks. On the odd + blocks, w is unchanged. + +void Aw_dble(complex_dble *v,complex_dble *w) + Applies the little Dirac operator to the field v and assigns the + result to the field w. + +void Aweeinv_dble(complex_dble *v,complex_dble *w) + Applies the inverse of the even-even part of the little Dirac operator + to the field v and assigns the result to the field w on the even blocks. + On the odd blocks, w is unchanged. + +void Awooinv_dble(complex_dble *v,complex_dble *w) + Applies the inverse of the odd-odd part of the little Dirac operator + to the field v and assigns the result to the field w on the odd blocks. + On the even blocks, w is unchanged. + +void Awoe_dble(complex_dble *v,complex_dble *w) + Applies the odd-even part of the little Dirac operator to the field v + and assigns the result to the field w on the odd blocks. On the even + blocks, w is unchanged. + +void Aweo_dble(complex_dble *v,complex_dble *w) + Applies the even-odd part of the little Dirac operator to the field v + and *subtracts* the result from the field w on the even blocks. On the + odd blocks, w is unchanged. + +void Awhat_dble(complex_dble *v,complex_dble *w) + Applies the even-odd preconditioned little Dirac operator to the field + v and assigns the result to the field w on the even blocks. On the odd + blocks, w is unchanged. + +int set_ltl_modes(void) + Computes the little modes, the associated little-little Dirac + operator and its inverse. The program returns 0 if the inversion + was safe and 1 if not. + +complex_dble *ltl_matrix(void) + Returns the pointer to an Ns x Ns matrix that represents the + *inverse* of the double-precision little-little Dirac operator. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/README.Aw b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/README.Aw new file mode 100644 index 0000000000000000000000000000000000000000..63539d6b1c81b151532ec80c58b86fd8305decb3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/README.Aw @@ -0,0 +1,149 @@ + +******************************************************************************** + + Definition of the little Dirac operator + +******************************************************************************** + +The little Dirac operator was introduced in + + M. Luescher: "Local coherence and deflation of the low quark modes + in lattice QCD", JHEP 0707 (2007) 081 + +Here the data structures used to represent the operator are described. + + +Definition +---------- + +The deflation subspace is spanned by the fields (*b).sd[1],..,(*b).sd[Ns] on +the blocks b of the DFL_BLOCKS grid. When the subspace is created, the basis +fields are orthonormalized on each block. The restriction of the Wilson-Dirac +operator Dw+i*mu*gamma_5 to the deflation subspace is referred to as the +little Dirac operator. It is completely specified by the matrix elements + + A_{n,k;m,l}=(v_{n,k},(Dw+i*mu*gamma_5)*v_{m,l}) + +where v_{n,0},v_{n,1},..,v_{n,Ns-1} are the basis vectors on the block with +index n. + + +Matrix arrays +------------- + +The DFL_BLOCKS grid consists of the sublattices of the even and odd blocks +(see dfl/dfl_geometry.c). In each local lattice, there are nb blocks, half of +which are even and half odd. The number nbb of blocks on the exterior boundary +of the local lattice also divides into equal numbers of even and odd blocks. + +The matrix A_{n,k;m,l} decomposes into four parts Aee, Aoo, Aoe and Aeo in the +obvious way. Each of these parts may be stored in the form of one-dimensional +arrays of complex Ns x Ns matrices. Explicitly + + Aee[n][Ns*k+l] = (v_{n,k},(Dw+i*mu*gamma_5)*v_{n,l}), + + Aoo[n][Ns*k+l] = (v_{m,k},(Dw+i*mu*gamma_5)*v_{m,l}), m=n+nb/2, + + Aoe[8*n+ifc][Ns*k+l] = (v_{m,k},(Dw+i*mu*gamma_5)*v_{inn[m][ifc],l}), + + Aeo(8*n+ifc][Ns*k+l] = (v_{inn[m][ifc],k},(Dw+i*mu*gamma_5)*v_{m,l}), + +where n=0,..,nb/2-1 labels the even blocks, m=nb/2,..,nb-1 the odd blocks, +ifc=0,..,7 the 8 coordinate directions -0,+0,..,-3,+3, while inn[m][ifc] is +the index of the block in direction ifc of the block with index m. + +In the case of the double-precision operator, the length of the arrays Aoe and +Aeo is 4*nb+nbb/2 rather than 4*nb. The additional nbb/2 elements at end of +the arrays are used at intermediate stages of the computations as buffers for +the matrices on the odd exterior boundary points of the block lattice. These +are stored in the order of the boundary points (see dfl/dfl_geometry.c and +README.Aw_com). + + +Even-odd flag +------------- + +The even-odd flag can be set and unset by calling set_tm_parms() (see +flags/lat_parms.c). Initially the flag is not set. + +The programs for the Dirac operator and thus those that construct the little +Dirac operator apply the twisted mass term i*mu*gamma_5 on the even sites of +the lattice only if the flag is set. The associated deflation projectors are +suitable for the solution of the Dirac operator with such a twisted-mass term. + + +Data structure +-------------- + +The single- and double-precision arrays representing the little Dirac operator +are collected in the structures Aw_t and Aw_dble_t (see include/little.h). The +elements of these structures are + + Ns,nb + Aee[nb/2][Ns*Ns] + Aoo[nb/2][Ns*Ns] + Aoe[4*nb][Ns*Ns] + Aeo[4*nb][Ns*Ns] + +As already mentioned, the length of the last two arrays is 4*nb+nbb/2 +rather than 4*nb in the case of the double precision operator. + + +Even-odd preconditioned operator +-------------------------------- + +The even-odd preconditioned little operator Ahat acts on fields supported on +the even blocks. It is related to the little operator A through + + Ahat=1-Aee^(-1)*Aeo*Aoo^(-1)*Aoe + +The preconditioned operator may be represented by Aw_t and Aw_dble_t +structures containing the matrix arrays + + Aee^(-1), Aoo^(-1), Aee^(-1)*Aeo and Aoo^(-1)*Aoe + +instead of Aee, Aoo, Aoe and Aeo. + +In the case of the double-precision preconditioned operator, the array of the +even-even terms is of length nb/2+nbb/2 instead of nb/2. The additional nbb/2 +elements at end of the array are used as buffers for the matrices on the even +interior boundary points of the block lattice (see README.Aw_com). + + +Little-little Dirac operator +---------------------------- + +The deflation subspace is constructed by projecting Ns global spinor fields to +the blocks of the DFL_BLOCKS grid. These global fields are linear combinations +of the basis fields v_{n,k} and span a subspace of dimension Ns within the +deflation subspace (which has dimension nb*Ns). + +The even-odd preconditioned little Dirac operator may be deflated using the +restriction of the global modes to the even blocks as the deflation modes. +These fields (which are also contained in the deflation subspace) are referred +to as the little modes, and the restriction of the even-odd preconditioned +little Dirac operator to the space spanned by them as the little-little Dirac +operator. Its action is completely specified by its matrix elements in the +space of the little modes, i.e. by a complex Ns x Ns matrix (the program +set_ltl_modes() orthonormalizes the little modes before the little-little +Dirac operator is calculated). + +The single-precision little modes md_k (k=0,..,Ns-1) and Awhat*md_k are stored +in the first and second half of the first Ns fields returned by vflds(). The +double-precision fields are stored in the same way in the Ns fields returned +by vdflds(). + + +Boundary conditions +------------------- + +In the case of boundary conditions of type 0,1 and 2, the hopping terms Aeo +and Aoe that go across the boundaries of the lattice at global time 0 and +NPROC0*L0-1 are equal to zero. + +The programs in this directory obtain the little Dirac operator always in the +same way and thus effectively as if periodic boundary conditions were imposed +in the time direction. Since the quark fields vanish at time 0 when boundary +conditions of type 0,1 or 2 are chosen, the calculation gives the correct +result (i.e. vanishing hopping terms across the lattice boundaries) also in +these cases. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/README.Aw_com b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/README.Aw_com new file mode 100644 index 0000000000000000000000000000000000000000..de5dbe03877483fc99fa19efffa3b3ff55d4c4bd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/README.Aw_com @@ -0,0 +1,204 @@ + +******************************************************************************** + + Calculation of the little Dirac operator + +******************************************************************************** + +The computation of the matrix arrays Aoe and Aeo representing the hopping +terms of the little Dirac operator require the spinor fields at the interior +boundaries of the local lattice to be communicated to the neighbouring +lattices. Some communications are also required when the matrices representing +the even-odd preconditioned operator are calculated. + + +Extraction of boundary fields +----------------------------- + +The computation of the matrix elements Aoe and Aeo involves a computation of +the scalar products of spinor fields residing at the interior boundary points +of the blocks in the DFL_BLOCKS grid. If b0 and b1 are neighbouring blocks +with indices n0 and n1, respectively, where b1 is displaced from b0 in the +positive direction mu, the geometrical situation is as follows: + + -----> x_mu + + ----- ----- + e | | o + e | | o + o | | e + o | | e + ----- ----- + block b0 block b1 + +Here "e" and "o" denote even and odd interior boundary points. The scalar +products to be computed are then + + sp[0][Ns*k+l] = -1/2*(v_{n0,k},(1-gamma_mu)*U*v_{n1,l}), + + sp[1][Ns*k,l] = -1/2*(v_{n0,k},(1+gamma_mu)*U*v_{n1,l}), + +where v_{n0,k} and v_{n1,l} (k,l=0,..,Ns-1) are the deflation modes on block +b0 and b1 respectively. In these scalar products, one sums over the interface +points only and "U" stands for the link variables across the interface. + +It is helpful to split the sum in the scalar products in two sums, one going +over the (e,o) pairs of points and the other over the (o,e) pairs (see the +figure above). The computation then proceeds by first extracting + + psi_{k,e}=v_{n0,k}_e, chi_{l,e}=v_{n1,l}_e, + + psi_{k,o}=U^dag*v_{n0,k}_o, chi_{l,o}=U*v_{n1,l}_o. + +Once this is done, the scalar products + + (psi_{k,e},chi_{l,o}), (psi_{k,e},gamma_mu*chi_{l,o}), + + (psi_{k,o},chi_{l,e}), (psi_{k,o},gamma_mu*chi_{l,e}), + +may be calculated, from which the matrices sp[0] and sp[1] are obtained by +taking simple linear combinations. + + +Communication of spinor fields +------------------------------ + +When the block b0 touches the boundary of the local lattice in direction mu, +the neighbouring block b1 is on the neighbouring MPI process. At the same time +the local lattice contains another block b1', with index n1', on the opposite +face of the local lattice, which is the neighbour in direction mu of the n0'th +block on the process in direction -mu: + + + -------- ---------------------- -------- + | | | | + | | | | + | | | | + | | | | + *****| |***** ***** | | ***** + * *| |* * * * | | * * + *****| |***** ***** | | ***** + b0' | | b1' b0 | | b1 + | | | | + | | | | + -------- ---------------------- -------- + + +Before the scalar products can be computed, some fields need to be moved from +and to the neighbouring processes. The program b2b_flds() moves + + psi_{k,e} from b0 to b1 and + + chi_{l,e} from b1 to b0 + +across the interface that separates b0 from b1. Note that b1' is the neighbour +of b0' on the local lattice to the left. Exchanges of fields across that +boundary are performed as in the case of the b0,b1 pair of blocks. + + +Elements of the b2b_flds_t structure +------------------------------------ + +The b2b_flds_t structure returned by the program b2b_flds() contain the +following data: + + n[2] n[0]=n0. n[1]=n1 or n1' depending on whether + b1 is on the local lattice or not. + + vol Number of points on the interface. + + ibn Indicates whether b1 is on a different + local lattice (ibn=1) or not (ibn=0). + + sde[2][Ns][vol] Extracted field arrays. + sdo[2][Ns][vol] + +The contents of the field arrays depends on whether a communication was needed +or not: + +ibn=0 (no communication): + + sde[0][k] = psi_{k,e} + sde[1][l] = chi_{l,e} + + sdo[0][k] = psi_{k,o} + sdo[1][l] = chi_{l,o} + +ibn=1: + + sde[0][k] = psi_{k,e}' (field communicated from b0') + sde[1][l] = chi_{l,e} (field communicated from b1) + + sdo[0][k] = psi_{k,o} (field extracted from b0) + sdo[1][l] = chi_{l,o}' (field extracted from b1') + + +Computation of scalar products +------------------------------ + +The calculation of the hopping terms Aoe and Aeo proceeds by running through +all block pairs b0,b1, extracting the boundary fields using b2b_flds() and +calculating the scalar products of the extracted fields. In the case of the +block pairs with ibn=0, the extracted fields are exactly those required for +these scalar products. However, if ibn=1, the scalar products that can be +formed (without further communication) are + + (psi_{k,e}',chi_{l,o}'), (psi_{k,e}',gamma_mu*chi_{l,o}') + +and + + (psi_{k,o},chi_{l,e}), (psi_{k,o},gamma_mu*chi_{l,e}). + +The first of these contribute to the hopping terms Aoe,Aeo to/from b0',b1' +and the second to those to/from b0,b1. + + +Assignment of the hopping terms +------------------------------- + +The calculated scalar products finally need to be assigned to the arrays Aoe +and Aeo in the data structures that define the little Dirac operator (see +README.Aw). In doing so, one should take into account that the labeling of the +blocks, as used in the description of the geometry of the DFL_BLOCKS grid, is +not guaranteed to coincide with the ordering of the blocks in block list +returned by blk_list(). The geometric label of the n'th block in the list is + + nsw=grd.idx[n], + +where grd=dfl_geometry() is the structure containing the grid geometry arrays. +The ordering of the matrices in the arrays Aoe and Aeo is the geometric one, +while the program b2b_flds() uses the natural ordering in the block list. + +The mapping of the scalar products is thus + +ibn=0: + + (psi_{k,o},chi_{l,e}), .. are assigned to Aoe[m+ifc],Aeo[m+ifc] where + m=grd.idx[n0] and ifc=2*mu+1 if b0 is odd or + m=grd.idx[n1] and ifc=2*mu if b0 is even. + +ibn=1: + + (psi_{k,o},chi_{l,e}), .. are assigned to Aoe[m+ifc],Aeo[m+ifc] where + m=grd.idx[n0] and ifc=2*mu+1 if b0 is odd. + + (psi_{k,o}',chi_{l,e}'), .. are assigned to Aoe[m+ifc],Aeo[m+ifc] where + m=grd.idx[n1'] and ifc=2*mu if b1' is odd. + + If b0 and/or b1' is even, the scalar products must be copied to the + neighbouring processes in direction +mu and -mu respectively. They are + first assigned to matrices at the end of the Aoe and Aeo arrays and + eventually (after all pairs of blocks are processed) are communicated + by the program cpAoe_ext_bnd(). + + +Even-odd preconditioned operator +-------------------------------- + +As explained in README.Aw, the even-odd preconditioned little Dirac operator +requires the computation of the products Aee^(-1)*Aeo and Aoo^(-1)*Aoe. All +matrices in the second product are locally available, but the first product +can only be formed after communicating the matrices Aee^(-1) residing at the +interior boundary of the local block lattice to the neighbouring processes. +The program cpAee_int_bnd() does that along the lines of the communication +programs for complex and spinor fields (see vflds/vdcom.c and sflds/sdcom.c). diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/ltl_modes.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/ltl_modes.c new file mode 100644 index 0000000000000000000000000000000000000000..b02992351c7f3d9a61ea37ff1b26c0e01d1bda77 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/ltl_modes.c @@ -0,0 +1,189 @@ + +/******************************************************************************* +* +* File ltl_modes.c +* +* Copyright (C) 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Computation of the little modes. +* +* The externally accessible functions are +* +* int set_ltl_modes(void) +* Computes the little modes, the associated little-little Dirac +* operator and its inverse. The program returns 0 if the inversion +* was safe and 1 if not. +* +* complex_dble *ltl_matrix(void) +* Returns the pointer to an Ns x Ns matrix that represents the +* *inverse* of the double-precision little-little Dirac operator. +* +* Notes: +* +* For a description of the little Dirac operator and the associated data +* structures see README.Aw. As usual, Ns denotes the number of deflation +* modes in each block of the DFL_BLOCKS grid. +* +* The inversion of a double-precision complex matrix is considered to be +* safe if and only if its Frobenius condition number is less than 10^6. +* +* All programs in this module may involve global communications and must +* be called simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define LTL_MODES_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "vflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "block.h" +#include "dfl.h" +#include "little.h" +#include "global.h" + +#define MAX_FROBENIUS 1.0e6 + +static int Ns=0,nv,nvh; +static complex **vs; +static complex_dble **vds,*Ads,*Bds,*Cds; + + +static void sum_vprod(int n,complex_dble *z,complex_dble *w) +{ + int k; + + if (NPROC>1) + { + MPI_Reduce(z,w,2*n,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(w,2*n,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + { + for (k=0;k0) + { + for (l=0;lMAX_FROBENIUS) + ifail=1; + + return ifail; +} + + +complex_dble *ltl_matrix(void) +{ + if (Ns==0) + alloc_matrices(); + + return Bds; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/mdflds/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/mdflds/README new file mode 100644 index 0000000000000000000000000000000000000000..20f58d885dc507bd84453c269da4d04242a5dc9c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/mdflds/README @@ -0,0 +1,59 @@ + +******************************************************************************** + + Molecular-dynamics auxiliary fields + +******************************************************************************** + + +Files +----- + +fcom.c Communication of the force variables residing at the + boundaries of the local lattices. + +mdflds.c Allocation and initialization of the MD auxiliary fields. + + + +Include file +------------ + +The file mdflds.h defines the prototypes for all externally accessible +functions that are defined in the *.c files listed above. + + +List of functions +----------------- + +void copy_bnd_frc(void) + Copies the force variables on the boundaries of the local lattice + from the neighbouring processes. The force variables on the spatial + links at time T are fetched only in the case of periodic boundary + conditions. + +void add_bnd_frc(void) + Adds the values of the force variables on the boundaries of the + local lattice to the force field on the neighbouring processes. + The force variables on the spatial links at time T are added only + in the case of periodic boundary conditions. + +mdflds_t *mdflds(void) + Returns the pointer to a mdflds_t structure containing the force and + momentum field. The fields are automatically allocated if needed. + +void set_frc2zero(void) + Sets all force variables, including those on the boundary, to zero. + +void bnd_mom2zero(void) + Sets the components of the momentum field on the static links + to zero (see the notes). + +void random_mom(void) + Sets the elements X of the momentum field on the active links to + random values with distribution proportional to exp(tr{X^2}). On + the static links the field is set to zero (see the notes). + +double momentum_action(int icom) + Returns the action of the momentum field. The action is summed + over all MPI processes if (and only if) icom=1. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/mdflds/fcom.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/mdflds/fcom.c new file mode 100644 index 0000000000000000000000000000000000000000..6265fb8170b330ae64d2115fb89657aa2da232fb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/mdflds/fcom.c @@ -0,0 +1,410 @@ + +/******************************************************************************* +* +* File fcom.c +* +* Copyright (C) 2010, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Communication of the force variables residing at the exterior boundaries +* of the local lattices. +* +* The externally accessible functions are +* +* void copy_bnd_frc(void) +* Copies the force variables from the neighbouring MPI processes to +* the exterior boundaries of the local lattice. The field variables +* on the spatial links at time NPROC0*L0 are fetched only in the case +* of periodic boundary conditions. +* +* void add_bnd_frc(void) +* Adds the force variables on the exterior boundaries of the local +* lattice to the field variables on the neighbouring MPI processes. +* The field variables on the spatial links at time NPROC0*L0 are +* added only in the case of periodic boundary conditions. +* +* Notes: +* +* The force field is the one returned by mdflds(). Its elements are ordered +* in the same way as those of the global gauge fields (see main/README.global +* and lattice/README.uidx). +* +* The programs in this module perform global communications and must be +* called simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define FCOM_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "mdflds.h" +#include "global.h" + +static int bc,np; +static const su3_alg_dble fd0={0.0}; +static su3_alg_dble *sbuf_f0=NULL,*sbuf_fk,*rbuf_f0,*rbuf_fk; +static mdflds_t *mdfs; +static uidx_t *idx; + + +static void alloc_frcbufs(void) +{ + int ib; + + bc=bc_type(); + np=(cpr[0]+cpr[1]+cpr[2]+cpr[3])&0x1; + mdfs=mdflds(); + idx=uidx(); + + sbuf_f0=amalloc(7*(BNDRY/4)*sizeof(*sbuf_f0),ALIGN); + error(sbuf_f0==NULL,1,"alloc_frcbufs [fcom.c]", + "Unable to allocate communication buffers"); + + sbuf_fk=sbuf_f0+(BNDRY/4); + rbuf_f0=(*mdfs).frc+4*VOLUME; + rbuf_fk=rbuf_f0+(BNDRY/4); + + for (ib=0;ib<(7*(BNDRY/4));ib++) + sbuf_f0[ib]=fd0; +} + + +static void pack_f0(void) +{ + int mu,nu0; + int *iu,*ium; + su3_alg_dble *f,*fb; + + fb=(*mdfs).frc; + f=sbuf_f0; + + for (mu=0;mu<4;mu++) + { + nu0=idx[mu].nu0; + + if (nu0>0) + { + iu=idx[mu].iu0; + ium=iu+nu0; + + for (;iu0) + { + if ((mu>0)||(cpr[0]>0)||(bc==3)) + { + iu=idx[mu].iuk; + ium=iu+nuk; + + for (;iu0) + { + tag=mpi_tag(); + saddr=npr[2*mu]; + raddr=npr[2*mu+1]; + nbf=8*nuk; + + if (np==0) + { + if ((mu>0)||(cpr[0]>0)||(bc==3)) + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + if ((mu>0)||(cpr[0]<(NPROC0-1))||(bc==3)) + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + if ((mu>0)||(cpr[0]<(NPROC0-1))||(bc==3)) + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + if ((mu>0)||(cpr[0]>0)||(bc==3)) + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + + sbuf+=nuk; + rbuf+=nuk; + } + } +} + + +void copy_bnd_frc(void) +{ + if (NPROC>1) + { + if (sbuf_f0==NULL) + alloc_frcbufs(); + + pack_f0(); + fwd_send_f0(); + pack_fk(); + fwd_send_fk(); + } +} + + +static void bck_send_f0(void) +{ + int mu,nu0,nbf; + int tag,saddr,raddr; + su3_alg_dble *sbuf,*rbuf; + MPI_Status stat; + + sbuf=rbuf_f0; + rbuf=sbuf_f0; + + for (mu=0;mu<4;mu++) + { + nu0=idx[mu].nu0; + + if (nu0>0) + { + tag=mpi_tag(); + saddr=npr[2*mu+1]; + raddr=npr[2*mu]; + nbf=8*nu0; + + if (np==0) + { + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + + sbuf+=nu0; + rbuf+=nu0; + } + } +} + + +static void bck_send_fk(void) +{ + int mu,nuk,nbf; + int tag,saddr,raddr; + su3_alg_dble *sbuf,*rbuf; + MPI_Status stat; + + sbuf=rbuf_fk; + rbuf=sbuf_fk; + + for (mu=0;mu<4;mu++) + { + nuk=idx[mu].nuk; + + if (nuk>0) + { + tag=mpi_tag(); + saddr=npr[2*mu+1]; + raddr=npr[2*mu]; + nbf=8*nuk; + + if (np==0) + { + if ((mu>0)||(cpr[0]<(NPROC0-1))||(bc==3)) + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + if ((mu>0)||(cpr[0]>0)||(bc==3)) + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + if ((mu>0)||(cpr[0]>0)||(bc==3)) + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + if ((mu>0)||(cpr[0]<(NPROC0-1))||(bc==3)) + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + + sbuf+=nuk; + rbuf+=nuk; + } + } +} + + +static void add_f0(void) +{ + int mu,nu0; + int *iu,*ium; + su3_alg_dble *f,*fb,*frc; + + fb=(*mdfs).frc; + f=sbuf_f0; + + for (mu=0;mu<4;mu++) + { + nu0=idx[mu].nu0; + + if (nu0>0) + { + iu=idx[mu].iu0; + ium=iu+nu0; + + for (;iu0) + { + if ((mu>0)||(cpr[0]>0)||(bc==3)) + { + iu=idx[mu].iuk; + ium=iu+nuk; + + for (;iu1) + { + if (sbuf_f0==NULL) + alloc_frcbufs(); + + bck_send_fk(); + add_fk(); + bck_send_f0(); + add_f0(); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/mdflds/mdflds.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/mdflds/mdflds.c new file mode 100644 index 0000000000000000000000000000000000000000..7c5926f3ee8be3fa4eadc011f238a9c74d600d94 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/mdflds/mdflds.c @@ -0,0 +1,201 @@ + +/******************************************************************************* +* +* File mdflds.c +* +* Copyright (C) 2011, 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Allocation and initialization of the MD auxiliary fields. +* +* The externally accessible functions are +* +* mdflds_t *mdflds(void) +* Returns the pointer to a mdflds_t structure containing the force and +* momentum field. The fields are automatically allocated if needed. +* +* void set_frc2zero(void) +* Sets all force variables, including those on the boundary, to zero. +* +* void bnd_mom2zero(void) +* Sets the components of the momentum field on the static links +* to zero (see the notes). +* +* void random_mom(void) +* Sets the elements X of the momentum field on the active links to +* random values with distribution proportional to exp(tr{X^2}). On +* the static links the field is set to zero (see the notes). +* +* double momentum_action(int icom) +* Returns the action of the momentum field. The action is summed +* over all MPI processes if (and only if) icom=1. +* +* Notes: +* +* The arrays *.mom and *.frc in the structure returned by mflds() are the +* molecular-dynamics momentum and force fields. Their elements are ordered +* in the same way as the link variables (see main/README.global). Moreover, +* the force field includes space for 7*(BNDRY/4) additional links as do the +* gauge fields (see lattice/README.uidx). +* +* Before the momentum and force fields are allocated, the geometry arrays +* must be set. The sets of static and active links depend on the chosen +* boundary conditions. Only the field variables on the active links are +* updated in the simulations. +* +* The number npf of pseudo-fermion fields is retrieved from the parameter +* data base (see flags/hmc_parms.c). It is thus assumed that npf has been +* set when the programs in this module are called for the first time (the +* field array is otherwise set to NULL). +* +* Pseudo-fermion fields are of the same size NSPIN as other quark fields. +* In the structure returned by mdflds(), the address of the pseudo-fermion +* field with index ipf is *.pf[ipf]. +* +* The programs potentially perform global operations and must be called +* simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define MDFLDS_C + +#include +#include +#include +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "linalg.h" +#include "mdflds.h" +#include "global.h" + +static const su3_alg_dble md0={0.0}; +static mdflds_t *mdfs=NULL; + + +static void alloc_mdflds(void) +{ + int npf,ipf; + su3_alg_dble *mom; + spinor_dble **pp,*p; + hmc_parms_t hmc; + + error_root(sizeof(su3_alg_dble)!=(8*sizeof(double)),1, + "alloc_mdflds [mdflds.c]", + "The su3_alg_dble structures are not properly packed"); + + error(iup[0][0]==0,1,"alloc_mdflds [mdflds.c]", + "The geometry arrays are not set"); + + mdfs=malloc(sizeof(*mdfs)); + mom=amalloc((8*VOLUME+7*(BNDRY/4))*sizeof(*mom),ALIGN); + error((mdfs==NULL)||(mom==NULL),1,"alloc_mdflds [mdflds.c]", + "Unable to allocate momentum and force fields"); + + set_alg2zero(8*VOLUME+7*(BNDRY/4),mom); + (*mdfs).mom=mom; + (*mdfs).frc=mom+4*VOLUME; + + hmc=hmc_parms(); + npf=hmc.npf; + + if (npf>0) + { + pp=malloc(npf*sizeof(*pp)); + p=amalloc(npf*NSPIN*sizeof(*p),ALIGN); + error((pp==NULL)||(p==NULL),1,"alloc_mdflds [mdflds.c]", + "Unable to allocate pseudo-fermion fields"); + set_sd2zero(npf*NSPIN,p); + + for (ipf=0;ipf=0. An error occurs if x is negative + +void ks_test(int n,double f[],double *pkp,double *pkm) + For a given array f[0],f[1],...,f[n-1], the program calculates + the Kolmogorov-Smirnov statistics K_n^{+}=*pkp and K_n^{-}=*pkm + +void ks_prob(int n,double kp,double km,double *pp,double *pm) + Computes the approximate probabilites *pp and *pm for the Kolmogorov- + Smirnov statistics K_n^{+} and K_n^{-} to be less than or equal to + kp and km respectively (eq.(4) in the notes). + +double pchi_square(double chi_square,int nu) + For chi_square>=0 and nu=1,2,...,1000 the program returns an + approximation for P(chi_square|nu) which deviates from the exact + distribution by less than 10^(-8) [10^(-9) if nu=1] + +double average(int n,double *a) + Returns the average of the array elements a[0],..,a[n-1] + +double sigma0(int n,double *a) + Returns the naive statistical error of the average of the array + elements a[0],..,a[n-1] + +double auto_corr(int n,double *a,int tmax,double *g) + Computes the normalized autocorrelation function g[t] at time + separations t=0,..,tmax-1 of the sequence a[0],..,a[n-1] and + returns the value of the (unnormalized) autocorrelation function + at t=0. The inequality tmax<=n must be respected + +void sigma_auto_corr(int n,double *a,int tmax,int lambda,double *eg) + Computes the statistical error eg[t] at time t=0,..,tmax-1 of the + normalized autocorrelation function of the sequence a[0],..,a[n-1]. + The choice of the summation cutoff lambda is not critical, but it + should be set to a value not smaller than a few times the integrated + autocorrelation time of the sequence (see the notes below). The + inequality 2*tmax+lambda-1<=n must be respected + +double tauint(int n,double *a,int tmax,int lambda,int *w,double *sigma) + Returns an estimate of the integrated autocorrelation time of the + sequence a[0],..,a[n-1]. On exit the summation window determined by + the program is assigned to *w and an estimate of the statistical + error on the calculated autocorrelation time is assigned to *sigma. + The parameter tmax sets an upper limit on the summation window and + the summation cutoff lambda should be set to a value not smaller than + a few times the integrated autocorrelation time (see the notes below). + The inequality 2*tmax+lambda-1<=n must be respected + +double print_auto(int n,double *a) + Prints a table of the approximate integrated auto-correlation time + tau(w)=1/2+sum_{t=1}^w g[t] and the associated statistical error + sigma(w)=sigma0*sqrt{2*tau(w)}, where g[t] denotes the normalized + autocorrelation function of the sequence a[0],..,a[n-1]. On exit + the program returns the average of the array elements + +double jack_err(int nx,int n,double **a,double (*f)(int nx,double *x), + int bmax,double *sig) + Computes the standard estimate of an arbitrary function f() of + nx primary stochastic variables x[k], k=0,..,nx-1, for a given + sequence a[k][0],..,a[k][n-1] of values of these. The associated + jackknife errors sig[bs-1] for bin size bs=1,..,bmax are also + computed. On exit the program returns the standard estimate of + the function f() + +double print_jack(int nx,int n,double **a,double (*f)(int nx,double *x)) + Prints a table of the jackknife errors calculated by the program + jack_err(), together with the estimated integrated autocorrelation + times, as a function of the bin size bs. On exit the program returns + the standard estimate of the function f() diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/chebyshev.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/chebyshev.c new file mode 100644 index 0000000000000000000000000000000000000000..a75363d31ddf95fff0c0c5fd629ab2df9b811ff1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/chebyshev.c @@ -0,0 +1,376 @@ + +/******************************************************************************* +* +* File chebyshev.c +* +* Copyright (C) 2005, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Chebyshev approximation and integration +* +* The externally accessible functions are +* +* int cheby_fit(double a,double b,double (*f)(double x), +* int nmax,double eps,double c[]) +* Computes the coefficients c[0],...,c[n], with n<=nmax being the +* value returned by the program and eps the desired absolute precision +* of the approximation +* +* double cheby_val(double a,double b,int n,double c[],double x) +* Computes the value of the Chebyshev approximation at x, assuming +* the coefficients c_k are stored in the array c[0],...,c[n] +* +* double cheby_int(double a,double b,double (*f)(double x), +* int nmax,double eps) +* Computes the definite integral of f(x) in the range a<=x<=b to an +* absolute precision eps, using Chebyshev polynomials of degree n<=nmax +* +* Notes: +* +* For the numerical approximation and integration of a given function f(x), +* using the Chebyshev polynomials +* +* T_k(z)=cos(k*theta), z=cos(theta), -1<=z<=1, +* +* the function is assumed to be defined in the range a<=x<=b and to be +* available as a function program. The approximation is then of the form +* +* f(x)=sum{c_k*T_k(z),k=0..n}+r(x), z=(a+b-2*x)/(a-b) +* +* |r(x)| +#include +#include +#include +#include "extras.h" + +static int max_degree; +static double *alist,*clist,*flist; + + +static void allocate_arrays(int nmax) +{ + for (max_degree=16;max_degree<=nmax;) + max_degree*=2; + + alist=malloc((max_degree+1)*sizeof(double)); + clist=malloc((max_degree*2)*sizeof(double)); + flist=malloc((max_degree+1)*sizeof(double)); +} + + +static void free_arrays(void) +{ + free(alist); + free(clist); + free(flist); +} + + +static void update_clist(int n) +{ + int k,kmin,kmax,dk; + double pi,x,dx; + + pi=4.0*atan(1.0); + dx=pi/(double)(max_degree); + + kmin=0; + kmax=2*max_degree; + dk=max_degree/n; + + if (n>32) + { + kmin=dk; + dk*=2; + } + + for (k=kmin;k32) + { + kmin=dk; + kmax-=dk; + dk*=2; + } + + for (k=kmin;k<=kmax;k+=dk) + { + x=0.5*(a+b-(a-b)*clist[k]); + flist[k]=(*f)(x); + } +} + + +static void compute_alist(int n) +{ + int i,k,dk; + double sum,r; + + dk=max_degree/n; + r=2.0/(double)n; + + for (i=0;i<=n;++i) + { + sum=0.5*(flist[0]+flist[max_degree]); + if (i%2==1) + sum-=flist[max_degree]; + + for (k=dk;km[i]) + m[i]=a; + } + } + + if ((m[0]>=1.0e2*m[1])&&(m[0]>=1.0e4*m[2])&&(m[0]>=1.0e6*m[3])) + return(0); + + return(1); +} + + +static double abs_error(int n) +{ + int k,kmin; + double err; + + kmin=n/2+1; + err=0.0; + + for (k=0;k=1;--k) + { + r+=fabs(c[k]); + if (r>=eps) + break; + } + + return(k); +} + + +int cheby_fit(double a,double b,double (*f)(double x), + int nmax,double eps,double c[]) +{ + int n,k,itest; + double err; + + if ((a>=b)||(nmax<16)||(eps<=0.0)) + { + printf("Error in cheby_fit\n"); + printf("Arguments out of range\n"); + printf("Program aborted\n\n"); + exit(0); + } + + itest=1; + err=eps; + allocate_arrays(nmax); + + for (n=32;n<=max_degree;n*=2) + { + update_clist(n); + update_flist(n,a,b,f); + compute_alist(n); + + itest=test_convergence(n); + err=abs_error(n); + + if ((itest==0)&&(err=eps)) + { + printf("Error in cheby_fit\n"); + printf("Specified accuracy has not been reached\n"); + printf("Program aborted\n\n"); + exit(0); + } + + n=economize(n,eps,err,c); + return(n); +} + + +double cheby_val(double a,double b,int n,double c[],double x) +{ + int k; + double u,v,w,z; + + if ((n<0)||(a>=b)||(x>b)||(x=0;--k) + { + w=z*u+v; + v=c[k]-u; + u=w; + } + + return(0.5*z*u+v); +} + + +double cheby_int(double a,double b,double (*f)(double x), + int nmax,double eps) +{ + int n,k,itest; + double err,sum; + + if ((a>=b)||(nmax<16)||(eps<=0.0)) + { + printf("Error in cheby_int\n"); + printf("Arguments out of range\n"); + printf("Program aborted\n\n"); + exit(0); + } + + itest=1; + err=eps; + sum=0.0; + allocate_arrays(nmax); + + for (n=32;n<=max_degree;n*=2) + { + update_clist(n); + update_flist(n,a,b,f); + compute_blist(n,a,b); + + itest=test_convergence(n); + err=abs_error(n); + + if ((itest==0)&&(err=0;k-=2) + sum+=alist[k]; + break; + } + } + + free_arrays(); + + if ((itest!=0)||(err>=eps)) + { + printf("Error in cheby_int\n"); + printf("Specified accuracy has not been reached\n"); + printf("Program aborted\n\n"); + exit(0); + } + + return(sum); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/fsolve.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/fsolve.c new file mode 100644 index 0000000000000000000000000000000000000000..2890e9a60185782498426b5e7d60bca032d6a8f1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/fsolve.c @@ -0,0 +1,543 @@ + +/******************************************************************************* +* +* File fsolve.c +* +* Copyright (C) 2008, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* General purpose equation solver and function minimizers +* +* The externally accessible functions are +* +* double inverse_fct(double y,double x1,double x2,double (*f)(double x), +* double omega1,double omega2) +* Finds a solution x of the equation f(x)=y in the interval [x1,x2] +* to an absolute precision omega1 or a relative precision omega2 +* (whichever is reached first). The points x1,x2 must be such that +* f(x1) and f(x2) have different sign +* +* double minimize_fct(double x0,double x1,double x2,double (*f)(double x), +* double omega1,double omega2) +* Finds a local minimum x of f(x) in the interval [x0,x2] to an +* absolute precision omega1 or a relative precision omega2 (whichever +* is reached first). The point x1 is taken as an initial guess of the +* position of the minimum (x0 +#include +#include +#include +#include "utils.h" +#include "extras.h" + +static int nsv=0,isv; +static double *osv,*psv,**vsv,*xsv; +static double (*fsv)(int n,double *x); + + +static int relative_sign(double f1,double f2) +{ + if (((f1>=0.0)&&(f2<=0.0))||((f1<=0.0)&&(f2>=0.0))) + return 1; + else + return 0; +} + + +double inverse_fct(double x1,double x2,double (*f)(double x),double y, + double omega1,double omega2) +{ + double x3,f1,f2,f3,dx; + double lambda,eps; + + f1=f(x1)-y; + f2=f(x2)-y; + + error((x1>x2)||(relative_sign(f1,f2)==0),1,"inverse_fct [fsolve.c]", + "Improper bracket [x1,x2]"); + + eps=0.1; + omega2*=0.5; + dx=x2-x1; + + while ((dx>omega1)&&(dx>(omega2*(x1+x2)))) + { + if (fabs(f1)f1)&&(f2>f1)) + return 0; + + if ((f(y0)>f1)&&(f(y2)>f1)) + { + (*x0)=y0; + (*x2)=y2; + return 0; + } + + for (ic=1;ic<20;ic++) + { + if (f1>f2) + { + (*x2)=(*x1); + f2=f1; + } + + (*x1)=(*x0); + f1=f0; + + (*x0)+=d0; + + if ((*x0)f1)&&(f2>f1)) + return 0; + } + + (*x0)=y1+d0; + (*x1)=y1; + (*x2)=y1+d2; + + f0=f(*x0); + f1=f(*x1); + f2=f(*x2); + + for (ic=1;ic<20;ic++) + { + if (f0y2) + (*x2)=y2; + + f2=f(*x2); + + if ((f0>f1)&&(f2>f1)) + return 0; + } + + return 1; +} + + +static double mini_fct(double x0,double x1,double x2,double (*f)(double x), + double omega1,double omega2) +{ + double s,x3,f1,f2,dx; + + omega2*=0.5; + s=0.5*(3.0-sqrt(5.0)); + x3=x2; + dx=x3-x0; + f1=f(x1); + + if ((x1-x0)<(x3-x1)) + { + x2=x1+s*(x3-x1); + f2=f(x2); + } + else + { + x2=x1; + f2=f1; + x1=x2-s*(x2-x0); + f1=f(x1); + } + + while ((dx>omega1)&&(dx>(omega2*(fabs(x0)+fabs(x3))))) + { + if (f1=x2),1,"minimize_fct [fsolve.c]", + "Improper input values x0,x1,x2"); + + error(find_bracket(&x0,&x1,&x2,f),1,"minimize_fct [fsolve.c]", + "Unable to bracket minimum"); + + return mini_fct(x0,x1,x2,f,omega1,omega2); +} + + +static void alloc_arrays(int n) +{ + int i,j; + + if (nsv!=n) + { + if (nsv!=0) + { + afree(osv); + afree(vsv); + } + + if (n>0) + { + osv=amalloc(n*(n+3)*sizeof(*psv),3); + vsv=amalloc(n*sizeof(*vsv),3); + + error((osv==NULL)||(vsv==NULL),1,"alloc_arrays [fsolve.c]", + "Unable to allocate auxiliary arrays"); + + psv=osv+n; + vsv[0]=psv+n; + + for (i=1;i0.0) + { + (*r0)=(x0[k]-psv[k])/v[k]; + (*r2)=(x2[k]-psv[k])/v[k]; + } + else + { + (*r0)=(x2[k]-psv[k])/v[k]; + (*r2)=(x0[k]-psv[k])/v[k]; + } + + for (j=0;jpa) + (*rom2)=pa/va; + + if (v[j]>0.0) + { + if ((psv[j]+(*r0)*v[j])x2[j]) + (*r2)=(x2[j]-psv[j])/v[j]; + } + else + { + if ((psv[j]+(*r0)*v[j])>x2[j]) + (*r0)=(x2[j]-psv[j])/v[j]; + if ((psv[j]+(*r2)*v[j])=x1[j])||(x2[j]<=x1[j])) + ifn=1; + } + + error(ifn,1,"powell [fsolve.c]","Improper parameter arrays x0,x1,x2"); + error((imx<4)||((omega1<=0.0)&&(omega2<=0.0)),1,"powell [fsolve.c]", + "Improper parameters imx,omega1 or omega2"); + + fsv=f; + alloc_arrays(n); + + for (j=0;jdel) + { + del=r0; + k=isv; + } + + fp=fe; + } + else if (i>=2) + { + for (j=0;j=x2[j])) + ifn=1; + } + + if (ifn==0) + { + fe=f(n,xsv); + r0=fe-fo; + r1=fo-fp-del; + r2=2.0*(fo-2.0*fp+fe)*r1*r1-del*r0*r0; + + if ((r0<(-4.0*DBL_EPSILON*fabs(fo)))&&(r2<0.0)) + { + for (j=0;j0) + vsv[k][j]=vsv[0][j]; + vsv[0][j]=psv[j]-osv[j]; + } + } + } + + io1=1; + io2=1; + + for (j=0;jomega1) + io1=0; + if (r0>(omega2*psv[j])) + io2=0; + + osv[j]=psv[j]; + } + + fo=fp; + + if ((i>=3)&&((io1==1)||(io2==1))) + break; + } + + for (j=0;j=0. An error occurs if x is negative +* +* Notes: +* +* The Bessel function is calculated by evaluating the integral +* +* exp(-x)*I_0(x)=int_0^Pi (dt/Pi)*exp(-x*(1-cos(t))) +* +* using Chebyshev polynomials +* +*******************************************************************************/ + +#define I0M_C + +#include +#include +#include +#include +#include "utils.h" +#include "extras.h" + +static double pi,xs; + + +static double maxt(double x) +{ + double r; + + pi=4.0*atan(1.0); + + if (x<1.0) + return pi; + + r=1.0-(0.5*log(2.0*pi*x)-log(DBL_EPSILON))/x; + + if (r>=1.0) + return 0.0; + else if (r<=-1.0) + return pi; + else + return acos(r); +} + + +static double f(double t) +{ + return exp(-xs*(1.0-cos(t))); +} + + +double i0m(double x) +{ + double a,b; + + if (x==0.0) + return 1.0; + + error(x<0.0,1,"i0m [i0.c]","The argument x must be non-negative"); + + a=0.0; + b=maxt(x); + xs=x; + + if (b==0.0) + return (1.0/sqrt(2.0*pi*x))*(1.0+1.0/(8.0*x)); + + return cheby_int(a,b,f,512,10.0*DBL_EPSILON)/pi; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/ks_test.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/ks_test.c new file mode 100644 index 0000000000000000000000000000000000000000..f0651e97f44988fdb922532fa2697e479af487bd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/ks_test.c @@ -0,0 +1,147 @@ + +/******************************************************************************* +* +* File ks_test.c +* +* Copyright (C) 2005, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Kolmogorov-Smirnov test +* +* The externally accessible functions are +* +* void ks_test(int n,double f[],double *pkp,double *pkm) +* For a given array f[0],f[1],...,f[n-1], the program calculates +* the Kolmogorov-Smirnov statistics K_n^{+}=*pkp and K_n^{-}=*pkm +* +* void ks_prob(int n,double kp,double km,double *pp,double *pm) +* Computes the approximate probabilites *pp and *pm for the Kolmogorov- +* Smirnov statistics K_n^{+} and K_n^{-} to be less than or equal to +* kp and km respectively (eq.(4) in the notes). +* +* Notes: +* +* See the notes +* +* M. Luescher: Statistical tests +* +* for a detailed description. +* +*******************************************************************************/ + +#define KS_TEST_C + +#include +#include +#include +#include "extras.h" + + +void ks_test(int n, double f[],double *pkp,double *pkm) +{ + int *pn,k,i; + double *pu,*pv,xn,sn,x,kp,km; + + if (n<=0) + { + printf("Error in ks_test: argument out of range\n"); + printf("Program aborted\n\n"); + exit(0); + } + + pn=malloc((n+1)*sizeof(int)); + pu=malloc((n+1)*sizeof(double)); + pv=malloc((n+1)*sizeof(double)); + xn=(double)n; + + if (pn&&pu&&pv) + { + for (k=0;k<=n;k++) + { + pn[k]=0; + pu[k]=xn; + pv[k]=0.0; + } + } + else + { + printf("Error in ks_test: could not allocate auxiliary arrays\n"); + printf("Program aborted\n\n"); + exit(0); + } + + for (i=0;ixn)) + { + printf("Error in ks_test: argument out of range\n"); + printf("Program aborted\n\n"); + exit(0); + } + + k=(int)x; + pn[k]+=1; + if (xpv[k]) + pv[k]=x; + } + + sn=0.0; + kp=0.0; + km=0.0; + + for (k=0;k<=n;k++) + { + if (pn[k]>0) + { + x=pu[k]-sn; + if (x>km) + km=x; + sn+=(double)pn[k]; + x=sn-pv[k]; + if (x>kp) + kp=x; + } + } + + *pkp=kp/sqrt(xn); + *pkm=km/sqrt(xn); + + free(pn); + free(pu); + free(pv); +} + + +void ks_prob(int n,double kp,double km,double *pp,double *pm) +{ + double xn; + + if (n<=0) + { + printf("Error in ks_prob: argument out of range\n"); + printf("Program aborted\n\n"); + exit(0); + } + + xn=(double)n; + + if (kp<1e-8) + *pp=0.0; + else if (kp>3.5) + *pp=1.0; + else + *pp=1.0-exp(-2.0*kp*kp)*(1.0-2.0*kp/(3.0*sqrt(xn))); + + if (km<1e-8) + *pm=0.0; + else if (km>3.5) + *pm=1.0; + else + *pm=1.0-exp(-2.0*km*km)*(1.0-2.0*km/(3.0*sqrt(xn))); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/pchi_square.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/pchi_square.c new file mode 100644 index 0000000000000000000000000000000000000000..cdc28d9c0f8178d37d0e668a0c64c6cb0667d1ff --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/pchi_square.c @@ -0,0 +1,205 @@ + +/******************************************************************************* +* +* File pchi_square.c +* +* Copyright (C) 2005, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Chi-square probability distribution +* +* The externally accessible function is +* +* double pchi_square(double chi_square,int nu) +* For chi_square>=0 and nu=1,2,...,1000 the program returns an +* approximation for P(chi_square|nu) which deviates from the exact +* distribution by less than 10^(-8) [10^(-9) if nu=1] +* +* Notes: +* +* See the notes +* +* M. Luescher: Statistical tests +* +* for a detailed description. +* +*******************************************************************************/ + +#define PCHI_SQUARE_C + +#include +#include +#include +#include "extras.h" + +static int init=0; +static double c0,c1,c2,c3,c4,c5,lng[40]; +static double xd0,xd1,xd2,xd3,xd4,pi; + + +static void define_constants(void) +{ + int n; + double x; + + xd0=0.0; + xd1=1.0; + xd2=2.0; + xd3=3.0; + xd4=4.0; + + pi=xd4*atan(xd1); + + c0=-xd1/xd2; + c1=log(xd2*pi)/xd2; + c2= xd1/12.0; + c3=-xd1/360.0; + c4= xd1/1260.0; + c5=-xd1/1680.0; + + lng[1]=log(pi)/xd2; + lng[2]=xd0; + + for (n=3;n<40;++n) + { + x=(double)(n-2); + lng[n]=lng[n-2]+log(x/xd2); + } + + init=1; +} + + +static double ln_gamma(int k) +{ + double y,z,zm1,zm2; + + if (k<40) + return lng[k]; + + z=(double)k; + z=z/xd2; + zm1=xd1/z; + zm2=zm1*zm1; + + y=c5; + y=y*zm2+c4; + y=y*zm2+c3; + y=y*zm2+c2; + y=y*zm1+c1; + + return (z+c0)*log(z)-z+y; +} + + +static double pchi1(double chi_square) +{ + double x,y,z,a,p; + + x=chi_square; + + if (x<=1.0e-18) + return xd0; + if (x>=40.0) + return xd1; + + z=x/xd2; + a=xd2*sqrt(z/pi)*exp(-z); + + y=xd0; + z=xd3; + p=x/z; + + for (;a>(xd1-p)*1.0e-9;) + { + y+=a; + a*=p; + z+=xd2; + p=x/z; + } + return y; +} + + +static double pchi2(double chi_square,int nu) +{ + double x,y,z,xnu,lna,a,p; + + x=chi_square; + xnu=(double)nu; + + if (x<=1.0e-18) + return xd0; + + if (x<=xnu) + { + z=x/xd2; + lna=(xnu/xd2)*log(z)-z-ln_gamma(nu+2); + + z=xnu+xd2; + p=x/z; + y=xd0; + + if ((lna-log(xd1-p))<-18.5) + return y; + + a=exp(lna); + + for (;a>(xd1-p)*1.0e-8;) + { + y+=a; + a*=p; + z+=xd2; + p=x/z; + } + return y; + } + else + { + z=x/xd2; + lna=((xnu/xd2-xd1)*log(z)-z)-ln_gamma(nu); + + z=xnu-xd2; + p=z/x; + if (nu%2==1) + y=pchi1(x); + else + y=xd1; + + if ((lna-log(xd1-p))<-18.5) + return y; + + a=exp(lna); + + for (;(z>=xd0)&&(a>(xd1-p)*9.0e-9);) + { + y-=a; + a*=p; + z-=xd2; + p=z/x; + } + return y; + } +} + + +double pchi_square(double chi_square,int nu) +{ + if (init==0) + define_constants(); + + if ((nu<1)||(nu>1000)||(chi_square +* +* The computation of the autocorrelation function and the integrated +* autocorrelation time follows the lines of appendix A of +* +* M. L"uscher, Schwarz-preconditioned HMC algorithm for two-flavor +* lattice QCD, Comput. Phys. Commun. 165 (2005) 199 [hep-lat/0409106] +* +* In particular, the summation cutoff lambda is introduced there and +* the selection of the summation window *w is explained +* +* The programs in this module may be used in MPI programs, but should then +* only be called from the root process +* +*******************************************************************************/ + +#define STAT_C + +#include +#include +#include +#include +#include "utils.h" +#include "extras.h" + + +double average(int n,double *a) +{ + int i; + double abar; + + error_root(n<1,1,"average [stat.c]", + "Argument n is out of range (should be at least 1)"); + + abar=0.0; + + for (i=0;in),1,"auto_corr [stat.c]", + "Argument n or tmax is out of range"); + + abar=average(n,a); + g0=sigma0(n,a); + + if (g0<=(10.0*DBL_EPSILON*fabs(abar))) + { + g0=0.0; + + for (t=0;tn),1, + "sigma_auto_corr [stat.c]", + "Argument n, tmax or lambda is out of range"); + + g=amalloc(tmaxx*sizeof(*g),3); + error_root(g==NULL,1,"sigma_auto_corr [stat.c]", + "Unable to allocate auxiliary array"); + + auto_corr(n,a,tmaxx,g); + sigma_corr(n,tmax,lambda,g,eg); + + afree(g); +} + + +double tauint(int n,double *a,int tmax,int lambda,int *w,double *sigma) +{ + int t,tmaxx; + double tau,g0; + double *g,*eg; + + tmaxx=2*tmax+lambda-1; + + error_root((n<2)||(tmax<1)||(lambda<1)||(tmaxx>n),1,"tauint [stat.c]", + "Argument n, tmax or lambda is out of range"); + + g=amalloc(tmaxx*sizeof(*g),3); + eg=amalloc(tmax*sizeof(*eg),3); + + error_root((g==NULL)||(eg==NULL),1,"tauint [stat.c]", + "Unable to allocate auxiliary arrays"); + + g0=auto_corr(n,a,tmaxx,g); + sigma_corr(n,tmax,lambda,g,eg); + + tau=0.5; + (*w)=1; + (*sigma)=0.0; + + if (g0!=0.0) + { + for (t=1;t=8)&&(iw>=4)) + { + iw=0; + dw*=2; + } + + w+=dw; + iw+=1; + } + } + + printf("\n"); + afree(ga); + afree(ta); + + return abar; +} + + +static double javg(int nx,int n,double **a,double (*f)(int nx,double *x)) +{ + int i; + double *x,fbar; + + x=amalloc(nx*sizeof(*x),3); + error_root(x==NULL,1,"javg [stat.c]","Unable to allocate auxiliary array"); + + for (i=0;in),1,"jack_err [stat.c]", + "Argument nx,n or bmax is out of range"); + + b=amalloc(nx*sizeof(*b),3); + p=amalloc(nx*n*sizeof(*p),3); + + error_root((b==NULL)||(p==NULL),1,"jack_err [stat.c]", + "Unable to allocate auxiliary arrays"); + + for (i=0;i=8)&&(ibs>=4)) + { + ibs=0; + dbs*=2; + } + + bs+=dbs; + ibs+=1; + } + + printf("\n"); + afree(sig); + + return fbar; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/utils/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/utils/README new file mode 100644 index 0000000000000000000000000000000000000000..b6d13b3aef05227eea2a3bd989b2cc9e09e75834 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/utils/README @@ -0,0 +1,109 @@ + +******************************************************************************** + + Basic utility functions + +******************************************************************************** + + +Files +----- + +endian.c Byte swapping programs + +mutils.c Utility programs used in main programs + +utils.c Collection of basic utility programs + + +Include file +------------ + +The file utils.h defines the prototypes for all externally accessible +functions that are defined in the *.c files listed above. + + +List of functions +----------------- + +int endianness(void) + Returns LITTLE_ENDIAN if the machine is little endian and BIG_ENDIAN + if it is big endian. Otherwise the return value is UNKNOWN_ENDIAN + +void bswap_int(int n,void *a) + Inverts the byte order of the array elements a[0],..,a[n-1] + assuming these are 4 byte long + +void bswap_double(int n,void *a) + Inverts the byte order of the array elements a[0],..,a[n-1] + assuming these are 8 byte long + +int find_opt(int argc,char *argv[],char *opt) + Returns the position of the option opt in the array argv[]. Only + the elements argv[1],..,argv[argc-1] are scanned and 0 is returned + if opt is not found. + +int digits(double x,double dx,char *fmt) + Assuming x is a value with error dx, this program returns the number n + of fractional digits to print so that all significant digits plus two + more are shown. The print format fmt has to be "e" or "f" depending on + whether the number is to be printed using the "%.ne" or "%.nf" format + string. In the second case dx has to be in the range 0 +#include +#include "utils.h" + + +int endianness(void) +{ + stduint_t i; + unsigned char *b; + + i=0x04030201; + b=(unsigned char*)(&i); + + if ((b[0]==1u)&&(b[1]==2u)&&(b[2]==3u)&&(b[3]==4u)) + return LITTLE_ENDIAN; + else if ((b[0]==4u)&&(b[1]==3u)&&(b[2]==2u)&&(b[3]==1u)) + return BIG_ENDIAN; + else return UNKNOWN_ENDIAN; +} + + +void bswap_int(int n,void *a) +{ + unsigned char *ba,*bam,bas; + + ba=(unsigned char*)(a); + bam=ba+4*n; + + for (;ba +#include +#include +#include +#include +#include "utils.h" + +static char line[NAME_SIZE+1]; +static char inum[3*sizeof(int)+4]; + + +int find_opt(int argc,char *argv[],char *opt) +{ + int k; + + for (k=1;k=fabs(x)) + return 1; + else + return (int)(floor(1.0+log10(fabs(x)))-floor(log10(dx))); + } + else if (strcmp(fmt,"f")==0) + { + error((dx==0.0)||(dx>=1.0),1,"digits [mutils.c]", + "Improper input data (error out of range for fixed format)"); + + return (int)(1.0-floor(log10(dx))); + } + else + error(1,1,"digits [mutils.c]","Unknown data format"); + + return 0; +} + + +int fdigits(double x) +{ + int m,n,ne,k; + double y,z; + + if (x==0.0) + return 0; + + y=fabs(x); + z=DBL_EPSILON*y; + m=floor(log10(y+z)); + n=0; + ne=1; + + for (k=0;k<(DBL_DIG-m);k++) + { + z=sqrt((double)(ne))*DBL_EPSILON*y; + + if (((y-floor(y))<=z)||((ceil(y)-y)<=z)) + break; + + y*=10.0; + ne+=1; + n+=1; + } + + return n; +} + + +int name_size(char *format,...) +{ + int nlen,ie,n; + double dmy; + char *pp,*pc; + va_list args; + + va_start(args,format); + pc=format; + nlen=strlen(format); + ie=0; + n=0; + + for (;;) + { + pp=strchr(pc,'%'); + + if (pp==NULL) + break; + + pc=pp+1; + + if (pc[0]=='s') + nlen+=(strlen(va_arg(args,char*))-2); + else if (pc[0]=='d') + { + sprintf(inum,"%d",va_arg(args,int)); + nlen+=(strlen(inum)-2); + } + else if (pc[0]=='.') + { + if (sscanf(pc,".%d",&n)!=1) + { + ie=1; + break; + } + + sprintf(inum,".%df",n); + pp=strstr(pc,inum); + + if (pp!=pc) + { + ie=2; + break; + } + + nlen+=(n+1-strlen(inum)); + dmy=va_arg(args,double); + if (dmy<0.0) + nlen+=1; + } + else + { + ie=3; + break; + } + } + + va_end(args); + error(ie!=0,1,"name_size [mutils.c]", + "Incorrect format string %s (ie=%d)",format,ie); + return nlen; +} + + +static int cmp_text(char *text1,char *text2) +{ + size_t n1,n2; + char *p1,*p2; + + p1=text1; + p2=text2; + + while (1) + { + p1+=strspn(p1," \t\n"); + p2+=strspn(p2," \t\n"); + n1=strcspn(p1," \t\n"); + n2=strcspn(p2," \t\n"); + + if (n1!=n2) + return 0; + if (n1==0) + return 1; + if (strncmp(p1,p2,n1)!=0) + return 0; + + p1+=n1; + p2+=n1; + } +} + + +static char *get_line(FILE *stream) +{ + char *s,*c; + + s=fgets(line,NAME_SIZE+1,stream); + + if (s!=NULL) + { + error(strlen(line)==NAME_SIZE,1,"get_line [mutils.c]", + "Input line is longer than NAME_SIZE-1"); + + c=strchr(line,'#'); + if (c!=NULL) + c[0]='\0'; + } + + return s; +} + + +long find_section(FILE *stream,char *title) +{ + int ie; + long ofs,sofs; + char *s,*pl,*pr; + + + rewind(stream); + sofs=-1L; + ofs=ftell(stream); + s=get_line(stream); + + while (s!=NULL) + { + pl=strchr(line,'['); + pr=strchr(line,']'); + + if ((pl==(line+strspn(line," \t")))&&(pr>pl)) + { + pl+=1; + pr[0]='\0'; + + if (cmp_text(pl,title)==1) + { + error(sofs>=0L,1,"find_section [mutils.c]", + "Section [%s] occurs more than once",title); + sofs=ofs; + } + } + + ofs=ftell(stream); + s=get_line(stream); + } + + error(sofs==-1L,1,"find_section [mutils.c]", + "Section [%s] not found",title); + ie=fseek(stream,sofs,SEEK_SET); + error(ie!=0,1,"find_section [mutils.c]", + "Unable to go to section [%s]",title); + get_line(stream); + + return sofs; +} + + +static void check_tag(char *tag) +{ + if (tag[0]=='\0') + return; + + error((strspn(tag," 0123456789.")!=0L)|| + (strcspn(tag," \n")!=strlen(tag)),1, + "check_tag [mutils.c]","Improper tag %s",tag); +} + + +static long find_tag(FILE *stream,char *tag) +{ + int ie; + long tofs,lofs,ofs; + char *s,*pl,*pr; + + ie=0; + tofs=-1L; + lofs=ftell(stream); + rewind(stream); + ofs=ftell(stream); + s=get_line(stream); + + while (s!=NULL) + { + pl=strchr(line,'['); + pr=strchr(line,']'); + + if ((pl==(line+strspn(line," \t")))&&(pr>pl)) + { + if (ofs +#include +#include +#include +#include "utils.h" + +struct addr_t +{ + char *addr; + char *true_addr; + struct addr_t *last,*next; +}; + +static struct addr_t *rpos=NULL; + + +int safe_mod(int x,int y) +{ + if (x>=0) + return(x%y); + else + return((y-(abs(x)%y))%y); +} + + +void *amalloc(size_t size,int p) +{ + int shift; + char *true_addr,*addr; + unsigned long mask; + struct addr_t *new,*rnxt; + + if ((size<=0)||(p<0)) + return(NULL); + + shift=1<=rlxd_size() + +void rlxd_reset(int state[]) + Resets the generator to the state defined by the array state[N] + +void ranlxs(float r[],int n) + Computes the next n single-precision random numbers and + assigns them to the elements r[0],...,r[n-1] of the array r[] + +void rlxs_init(int level,int seed) + Initialization of the generator + +int rlxs_size(void) + Returns the number of integers required to save the state of + the generator + +void rlxs_get(int state[]) + Extracts the current state of the generator and stores the + information in the array state[N] where N>=rlxs_size() + +void rlxs_reset(int state[]) + Resets the generator to the state defined by the array state[N] diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/random/gauss.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/random/gauss.c new file mode 100644 index 0000000000000000000000000000000000000000..463cd662b27531d468526a36d8f51057f9086fad --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/random/gauss.c @@ -0,0 +1,97 @@ + +/******************************************************************************* +* +* File gauss.c +* +* Copyright (C) 2005 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Generation of Gaussian random numbers +* +* The externally accessible functions are +* +* void gauss(float r[],int n) +* Generates n single-precision Gaussian random numbers x with distribution +* proportional to exp(-x^2) and assigns them to r[0],..,r[n-1] +* +* void gauss_dble(double rd[],int n) +* Generates n double-precision Gaussian random numbers x with distribution +* proportional to exp(-x^2) and assigns them to rd[0],..,rd[n-1] +* +*******************************************************************************/ + +#define GAUSS_C + +#include +#include +#include +#include "utils.h" +#include "random.h" + +static int init=0; +static double twopi; + + +void gauss(float r[],int n) +{ + int k; + float u[2]; + double x1,x2,rho,y1,y2; + + if (init==0) + { + twopi=8.0*atan(1.0); + init=1; + } + + for (k=0;k +#include +#include +#include "mpi.h" +#include "utils.h" +#include "lattice.h" +#include "random.h" +#include "global.h" + +static int *rlxs_state=NULL,*rlxd_state; +static stdint_t *state; + + +static int check_machine(void) +{ + int ie; + + error_root(sizeof(stdint_t)!=4,1,"check_machine [ranlux.c]", + "Size of a stdint_t integer is not 4"); + + ie=endianness(); + error_root(ie==UNKNOWN_ENDIAN,1,"check_machine [ranlux.c]", + "Unkown endianness"); + + return ie; +} + + +static int alloc_state(void) +{ + int nlxs,nlxd,n; + + nlxs=rlxs_size(); + nlxd=rlxd_size(); + n=nlxs+nlxd; + + if (rlxs_state==NULL) + { + rlxs_state=malloc(n*sizeof(int)); + rlxd_state=rlxs_state+nlxs; + state=malloc(n*sizeof(stdint_t)); + error((rlxs_state==NULL)||(state==NULL),1,"alloc_state [ranlux.c]", + "Unable to allocate state arrays"); + } + + return n; +} + + +static int get_ip(int n) +{ + int np[4]; + + np[3]=n%NPROC3; + n/=NPROC3; + np[2]=n%NPROC2; + n/=NPROC2; + np[1]=n%NPROC1; + n/=NPROC1; + np[0]=n; + + return ipr_global(np); +} + + +static void get_state(void) +{ + rlxs_get(rlxs_state); + rlxd_get(rlxd_state); +} + + +static void reset_state(void) +{ + rlxs_reset(rlxs_state); + rlxd_reset(rlxd_state); +} + + +void start_ranlux(int level,int seed) +{ + int my_rank,max_seed,loc_seed; + int n,iprms[2]; + + if (NPROC>1) + { + iprms[0]=level; + iprms[1]=seed; + + MPI_Bcast(iprms,2,MPI_INT,0,MPI_COMM_WORLD); + + error((iprms[0]!=level)||(iprms[1]!=seed),1, + "start_ranlux [ranlux.c]","Input parameters are not global"); + } + + max_seed=INT_MAX/NPROC; + + error_root((level<0)||(level>1)||(seed<1)||(seed>max_seed),1, + "start_ranlux [ranlux.c]","Parameters are out of range"); + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + loc_seed=0; + + for (n=0;n0) + { + if (my_rank==0) + { + MPI_Send(&dmy,1,MPI_INT,ip,tag0,MPI_COMM_WORLD); + MPI_Recv(rlxs_state,ns,MPI_INT,ip,tag1,MPI_COMM_WORLD,&stat); + } + else if (my_rank==ip) + { + get_state(); + MPI_Recv(&dmy,1,MPI_INT,0,tag0,MPI_COMM_WORLD,&stat); + MPI_Send(rlxs_state,ns,MPI_INT,0,tag1,MPI_COMM_WORLD); + } + } + else if (my_rank==0) + get_state(); + + if (my_rank==0) + { + for (k=0;k0) + { + if (my_rank==0) + { + MPI_Send(rlxs_state,ns,MPI_INT,ip,tag1,MPI_COMM_WORLD); + MPI_Recv(&dmy,1,MPI_INT,ip,tag0,MPI_COMM_WORLD,&stat); + } + else if (my_rank==ip) + { + MPI_Recv(rlxs_state,ns,MPI_INT,0,tag1,MPI_COMM_WORLD,&stat); + MPI_Send(&dmy,1,MPI_INT,0,tag0,MPI_COMM_WORLD); + reset_state(); + } + } + else if (my_rank==0) + reset_state(); + } + + error_chk(); + + if (my_rank==0) + { + n=(int)(lsize[0]); + error_root(ir!=(9+NPROC*ns),1,"import_ranlux [ranlux.c]", + "Incorrect read count"); + fclose(fin); + } + + MPI_Bcast(&n,1,MPI_INT,0,MPI_COMM_WORLD); + + return n; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/random/ranlxd.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/random/ranlxd.c new file mode 100644 index 0000000000000000000000000000000000000000..5c9cee3a0c6bcda3029a52728c2641ff8fae5a3e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/random/ranlxd.c @@ -0,0 +1,610 @@ + +/******************************************************************************* +* +* File ranlxd.c +* +* Copyright (C) 2005, 2008, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Random number generator "ranlxd" version 3.0. See the notes +* +* "User's guide for ranlxs and ranlxd v3.0" (May 2001) +* +* "Algorithms used in ranlux v3.0" (May 2001) +* +* for a detailed description. +* +* The externally accessible functions are +* +* void ranlxd(double r[],int n) +* Computes the next n double-precision random numbers and +* assigns them to the elements r[0],...,r[n-1] of the array r[] +* +* void rlxd_init(int level,int seed) +* Initialization of the generator +* +* int rlxd_size(void) +* Returns the number of integers required to save the state of +* the generator +* +* void rlxd_get(int state[]) +* Extracts the current state of the generator and stores the +* information in the array state[N] where N>=rlxd_size() +* +* void rlxd_reset(int state[]) +* Resets the generator to the state defined by the array state[N] +* +*******************************************************************************/ + +#define RANLXD_C + +#include +#include +#include +#include +#include +#include "utils.h" +#include "random.h" + +#if (defined x64) + +typedef struct +{ + float c1,c2,c3,c4; +} vec_t __attribute__ ((aligned (16))); + +typedef struct +{ + vec_t c1,c2; +} dble_vec_t __attribute__ ((aligned (16))); + +static int init=0,pr,prm,ir,jr,is,is_old,next[96]; +static vec_t one,one_bit,carry; + +static union +{ + dble_vec_t vec[12]; + float num[96]; +} x __attribute__ ((aligned (16))); + +#define STEP(pi,pj) \ + __asm__ __volatile__ ("movaps %4, %%xmm4 \n\t" \ + "movaps %%xmm2, %%xmm3 \n\t" \ + "subps %2, %%xmm4 \n\t" \ + "movaps %%xmm1, %%xmm5 \n\t" \ + "cmpps $0x6, %%xmm4, %%xmm2 \n\t" \ + "andps %%xmm2, %%xmm5 \n\t" \ + "subps %%xmm3, %%xmm4 \n\t" \ + "andps %%xmm0, %%xmm2 \n\t" \ + "addps %%xmm4, %%xmm5 \n\t" \ + "movaps %%xmm5, %0 \n\t" \ + "movaps %5, %%xmm6 \n\t" \ + "movaps %%xmm2, %%xmm3 \n\t" \ + "subps %3, %%xmm6 \n\t" \ + "movaps %%xmm1, %%xmm7 \n\t" \ + "cmpps $0x6, %%xmm6, %%xmm2 \n\t" \ + "andps %%xmm2, %%xmm7 \n\t" \ + "subps %%xmm3, %%xmm6 \n\t" \ + "andps %%xmm0, %%xmm2 \n\t" \ + "addps %%xmm6, %%xmm7 \n\t" \ + "movaps %%xmm7, %1" \ + : \ + "=m" ((*pi).c1), \ + "=m" ((*pi).c2) \ + : \ + "m" ((*pi).c1), \ + "m" ((*pi).c2), \ + "m" ((*pj).c1), \ + "m" ((*pj).c2) \ + : \ + "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7") + + +static void update(void) +{ + int k,kmax; + dble_vec_t *pmin,*pmax,*pi,*pj; + + kmax=pr; + pmin=&x.vec[0]; + pmax=pmin+12; + pi=&x.vec[ir]; + pj=&x.vec[jr]; + + __asm__ __volatile__ ("movaps %0, %%xmm0 \n\t" + "movaps %1, %%xmm1 \n\t" + "movaps %2, %%xmm2" + : + : + "m" (one_bit), + "m" (one), + "m" (carry) + : + "xmm0", "xmm1", "xmm2"); + + for (k=0;k=12) + ir-=12; + if (jr>=12) + jr-=12; + is=8*ir; + is_old=is; +} + + +static void define_constants(void) +{ + int k; + float b; + + one.c1=1.0f; + one.c2=1.0f; + one.c3=1.0f; + one.c4=1.0f; + + b=(float)(ldexp(1.0,-24)); + one_bit.c1=b; + one_bit.c2=b; + one_bit.c3=b; + one_bit.c4=b; + + for (k=0;k<96;k++) + { + next[k]=(k+1)%96; + if ((k%4)==3) + next[k]=(k+5)%96; + } +} + + +void rlxd_init(int level,int seed) +{ + int i,k,l; + int ibit,jbit,xbit[31]; + int ix,iy; + + define_constants(); + + error_loc((level<1)||(level>2),1,"rlxd_init [ranlxd.c]", + "Bad choice of luxury level (should be 1 or 2)"); + + if (level==1) + pr=202; + else if (level==2) + pr=397; + + i=seed; + + for (k=0;k<31;k++) + { + xbit[k]=i%2; + i/=2; + } + + error_loc((seed<=0)||(i!=0),1,"rlxd_init [ranlxd.c]", + "Bad choice of seed (should be between 1 and 2^31-1)"); + + ibit=0; + jbit=18; + + for (i=0;i<4;i++) + { + for (k=0;k<24;k++) + { + ix=0; + + for (l=0;l<24;l++) + { + iy=xbit[ibit]; + ix=2*ix+iy; + + xbit[ibit]=(xbit[ibit]+xbit[jbit])%2; + ibit=(ibit+1)%31; + jbit=(jbit+1)%31; + } + + if ((k%4)!=i) + ix=16777215-ix; + + x.num[4*k+i]=(float)(ldexp((double)(ix),-24)); + } + } + + carry.c1=0.0f; + carry.c2=0.0f; + carry.c3=0.0f; + carry.c4=0.0f; + + ir=0; + jr=7; + is=91; + is_old=0; + prm=pr%12; + init=1; +} + + +void ranlxd(double r[],int n) +{ + int k; + + if (init==0) + rlxd_init(1,1); + + for (k=0;k=167777216),1, + "rlxd_reset [ranlxd.c]","Unexpected input data"); + + x.num[k]=(float)(ldexp((double)(state[k+1]),-24)); + } + + error_loc(((state[97]!=0)&&(state[97]!=1))|| + ((state[98]!=0)&&(state[98]!=1))|| + ((state[99]!=0)&&(state[99]!=1))|| + ((state[100]!=0)&&(state[100]!=1)),1, + "rlxd_reset [ranlxd.c]","Unexpected input data"); + + carry.c1=(float)(ldexp((double)(state[97]),-24)); + carry.c2=(float)(ldexp((double)(state[98]),-24)); + carry.c3=(float)(ldexp((double)(state[99]),-24)); + carry.c4=(float)(ldexp((double)(state[100]),-24)); + + pr=state[101]; + ir=state[102]; + jr=state[103]; + is=state[104]; + is_old=8*ir; + prm=pr%12; + init=1; + + error_loc(((pr!=202)&&(pr!=397))|| + (ir<0)||(ir>11)||(jr<0)||(jr>11)||(jr!=((ir+7)%12))|| + (is<0)||(is>91),1, + "rlxd_reset [ranlxd.c]","Unexpected input data"); +} + +#else + +#define BASE 0x1000000 +#define MASK 0xffffff + +typedef struct +{ + int c1,c2,c3,c4; +} vec_t; + +typedef struct +{ + vec_t c1,c2; +} dble_vec_t; + +static int init=0,pr,prm,ir,jr,is,is_old,next[96]; +static double one_bit; +static vec_t carry; + +static union +{ + dble_vec_t vec[12]; + int num[96]; +} x; + +#define STEP(pi,pj) \ + d=(*pj).c1.c1-(*pi).c1.c1-carry.c1; \ + (*pi).c2.c1+=(d<0); \ + d+=BASE; \ + (*pi).c1.c1=d&MASK; \ + d=(*pj).c1.c2-(*pi).c1.c2-carry.c2; \ + (*pi).c2.c2+=(d<0); \ + d+=BASE; \ + (*pi).c1.c2=d&MASK; \ + d=(*pj).c1.c3-(*pi).c1.c3-carry.c3; \ + (*pi).c2.c3+=(d<0); \ + d+=BASE; \ + (*pi).c1.c3=d&MASK; \ + d=(*pj).c1.c4-(*pi).c1.c4-carry.c4; \ + (*pi).c2.c4+=(d<0); \ + d+=BASE; \ + (*pi).c1.c4=d&MASK; \ + d=(*pj).c2.c1-(*pi).c2.c1; \ + carry.c1=(d<0); \ + d+=BASE; \ + (*pi).c2.c1=d&MASK; \ + d=(*pj).c2.c2-(*pi).c2.c2; \ + carry.c2=(d<0); \ + d+=BASE; \ + (*pi).c2.c2=d&MASK; \ + d=(*pj).c2.c3-(*pi).c2.c3; \ + carry.c3=(d<0); \ + d+=BASE; \ + (*pi).c2.c3=d&MASK; \ + d=(*pj).c2.c4-(*pi).c2.c4; \ + carry.c4=(d<0); \ + d+=BASE; \ + (*pi).c2.c4=d&MASK + + +static void update(void) +{ + int k,kmax,d; + dble_vec_t *pmin,*pmax,*pi,*pj; + + kmax=pr; + pmin=&x.vec[0]; + pmax=pmin+12; + pi=&x.vec[ir]; + pj=&x.vec[jr]; + + for (k=0;k=12) + ir-=12; + if (jr>=12) + jr-=12; + is=8*ir; + is_old=is; +} + + +static void define_constants(void) +{ + int k; + + one_bit=ldexp(1.0,-24); + + for (k=0;k<96;k++) + { + next[k]=(k+1)%96; + if ((k%4)==3) + next[k]=(k+5)%96; + } +} + + +void rlxd_init(int level,int seed) +{ + int i,k,l; + int ibit,jbit,xbit[31]; + int ix,iy; + + error_loc((INT_MAX<2147483647)||(FLT_RADIX!=2)||(FLT_MANT_DIG<24)|| + (DBL_MANT_DIG<48),1,"rlxd_init [ranlxd.c]", + "Arithmetic on this machine is not suitable for ranlxd"); + + define_constants(); + + error_loc((level<1)||(level>2),1,"rlxd_init [ranlxd.c]", + "Bad choice of luxury level (should be 1 or 2)"); + + if (level==1) + pr=202; + else if (level==2) + pr=397; + + i=seed; + + for (k=0;k<31;k++) + { + xbit[k]=i%2; + i/=2; + } + + error_loc((seed<=0)||(i!=0),1,"rlxd_init [ranlxd.c]", + "Bad choice of seed (should be between 1 and 2^31-1)"); + + ibit=0; + jbit=18; + + for (i=0;i<4;i++) + { + for (k=0;k<24;k++) + { + ix=0; + + for (l=0;l<24;l++) + { + iy=xbit[ibit]; + ix=2*ix+iy; + + xbit[ibit]=(xbit[ibit]+xbit[jbit])%2; + ibit=(ibit+1)%31; + jbit=(jbit+1)%31; + } + + if ((k%4)!=i) + ix=16777215-ix; + + x.num[4*k+i]=ix; + } + } + + carry.c1=0; + carry.c2=0; + carry.c3=0; + carry.c4=0; + + ir=0; + jr=7; + is=91; + is_old=0; + prm=pr%12; + init=1; +} + + +void ranlxd(double r[],int n) +{ + int k; + + if (init==0) + rlxd_init(1,1); + + for (k=0;k=167777216),1, + "rlxd_reset [ranlxd.c]","Unexpected input data"); + + x.num[k]=state[k+1]; + } + + error_loc(((state[97]!=0)&&(state[97]!=1))|| + ((state[98]!=0)&&(state[98]!=1))|| + ((state[99]!=0)&&(state[99]!=1))|| + ((state[100]!=0)&&(state[100]!=1)),1, + "rlxd_reset [ranlxd.c]","Unexpected input data"); + + carry.c1=state[97]; + carry.c2=state[98]; + carry.c3=state[99]; + carry.c4=state[100]; + + pr=state[101]; + ir=state[102]; + jr=state[103]; + is=state[104]; + is_old=8*ir; + prm=pr%12; + init=1; + + error_loc(((pr!=202)&&(pr!=397))|| + (ir<0)||(ir>11)||(jr<0)||(jr>11)||(jr!=((ir+7)%12))|| + (is<0)||(is>91),1, + "rlxd_reset [ranlxd.c]","Unexpected input data"); +} + +#endif + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/random/ranlxs.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/random/ranlxs.c new file mode 100644 index 0000000000000000000000000000000000000000..10752f3fd667a21fb16e2a298bd73b020d087a53 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/random/ranlxs.c @@ -0,0 +1,605 @@ + +/******************************************************************************* +* +* File ranlxs.c +* +* Copyright (C) 2005, 2008, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Random number generator "ranlxs" version 3.0. See the notes +* +* "User's guide for ranlxs and ranlxd v3.0" (May 2001) +* +* "Algorithms used in ranlux v3.0" (May 2001) +* +* for a detailed description. +* +* The externally accessible functions are +* +* void ranlxs(float r[],int n) +* Computes the next n single-precision random numbers and +* assigns them to the elements r[0],...,r[n-1] of the array r[] +* +* void rlxs_init(int level,int seed) +* Initialization of the generator +* +* int rlxs_size(void) +* Returns the number of integers required to save the state of +* the generator +* +* void rlxs_get(int state[]) +* Extracts the current state of the generator and stores the +* information in the array state[N] where N>=rlxs_size() +* +* void rlxs_reset(int state[]) +* Resets the generator to the state defined by the array state[N] +* +*******************************************************************************/ + +#define RANLXS_C + +#include +#include +#include +#include +#include +#include "utils.h" +#include "random.h" + +#if (defined x64) + +typedef struct +{ + float c1,c2,c3,c4; +} vec_t __attribute__ ((aligned (16))); + +typedef struct +{ + vec_t c1,c2; +} dble_vec_t __attribute__ ((aligned (16))); + +static int init=0,pr,prm,ir,jr,is,is_old,next[96]; +static vec_t one,one_bit,carry; + +static union +{ + dble_vec_t vec[12]; + float num[96]; +} x __attribute__ ((aligned (16))); + +#define STEP(pi,pj) \ + __asm__ __volatile__ ("movaps %4, %%xmm4 \n\t" \ + "movaps %%xmm2, %%xmm3 \n\t" \ + "subps %2, %%xmm4 \n\t" \ + "movaps %%xmm1, %%xmm5 \n\t" \ + "cmpps $0x6, %%xmm4, %%xmm2 \n\t" \ + "andps %%xmm2, %%xmm5 \n\t" \ + "subps %%xmm3, %%xmm4 \n\t" \ + "andps %%xmm0, %%xmm2 \n\t" \ + "addps %%xmm4, %%xmm5 \n\t" \ + "movaps %%xmm5, %0 \n\t" \ + "movaps %5, %%xmm6 \n\t" \ + "movaps %%xmm2, %%xmm3 \n\t" \ + "subps %3, %%xmm6 \n\t" \ + "movaps %%xmm1, %%xmm7 \n\t" \ + "cmpps $0x6, %%xmm6, %%xmm2 \n\t" \ + "andps %%xmm2, %%xmm7 \n\t" \ + "subps %%xmm3, %%xmm6 \n\t" \ + "andps %%xmm0, %%xmm2 \n\t" \ + "addps %%xmm6, %%xmm7 \n\t" \ + "movaps %%xmm7, %1" \ + : \ + "=m" ((*pi).c1), \ + "=m" ((*pi).c2) \ + : \ + "m" ((*pi).c1), \ + "m" ((*pi).c2), \ + "m" ((*pj).c1), \ + "m" ((*pj).c2) \ + : \ + "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7") + + +static void update(void) +{ + int k,kmax; + dble_vec_t *pmin,*pmax,*pi,*pj; + + kmax=pr; + pmin=&x.vec[0]; + pmax=pmin+12; + pi=&x.vec[ir]; + pj=&x.vec[jr]; + + __asm__ __volatile__ ("movaps %0, %%xmm0 \n\t" + "movaps %1, %%xmm1 \n\t" + "movaps %2, %%xmm2" + : + : + "m" (one_bit), + "m" (one), + "m" (carry) + : + "xmm0", "xmm1", "xmm2"); + + for (k=0;k=12) + ir-=12; + if (jr>=12) + jr-=12; + is=8*ir; + is_old=is; +} + + +static void define_constants(void) +{ + int k; + float b; + + one.c1=1.0f; + one.c2=1.0f; + one.c3=1.0f; + one.c4=1.0f; + + b=(float)(ldexp(1.0,-24)); + one_bit.c1=b; + one_bit.c2=b; + one_bit.c3=b; + one_bit.c4=b; + + for (k=0;k<96;k++) + next[k]=(k+1)%96; +} + + +void rlxs_init(int level,int seed) +{ + int i,k,l; + int ibit,jbit,xbit[31]; + int ix,iy; + + define_constants(); + + error_loc((level<0)||(level>2),1,"rlxs_init [ranlxs.c]", + "Bad choice of luxury level (should be 0,1 or 2)"); + + if (level==0) + pr=109; + else if (level==1) + pr=202; + else if (level==2) + pr=397; + + i=seed; + + for (k=0;k<31;k++) + { + xbit[k]=i%2; + i/=2; + } + + error_loc((seed<=0)||(i!=0),1,"rlxs_init [ranlxs.c]", + "Bad choice of seed (should be between 1 and 2^31-1)"); + + ibit=0; + jbit=18; + + for (i=0;i<4;i++) + { + for (k=0;k<24;k++) + { + ix=0; + + for (l=0;l<24;l++) + { + iy=xbit[ibit]; + ix=2*ix+iy; + + xbit[ibit]=(xbit[ibit]+xbit[jbit])%2; + ibit=(ibit+1)%31; + jbit=(jbit+1)%31; + } + + if ((k%4)==i) + ix=16777215-ix; + + x.num[4*k+i]=(float)(ldexp((double)(ix),-24)); + } + } + + carry.c1=0.0f; + carry.c2=0.0f; + carry.c3=0.0f; + carry.c4=0.0f; + + ir=0; + jr=7; + is=95; + is_old=0; + prm=pr%12; + init=1; +} + + +void ranlxs(float r[],int n) +{ + int k; + + if (init==0) + rlxs_init(0,1); + + for (k=0;k=167777216),1, + "rlxs_reset [ranlxs.c]","Unexpected input data"); + + x.num[k]=(float)(ldexp((double)(state[k+1]),-24)); + } + + error_loc(((state[97]!=0)&&(state[97]!=1))|| + ((state[98]!=0)&&(state[98]!=1))|| + ((state[99]!=0)&&(state[99]!=1))|| + ((state[100]!=0)&&(state[100]!=1)),1, + "rlxs_reset [ranlxs.c]","Unexpected input data"); + + carry.c1=(float)(ldexp((double)(state[97]),-24)); + carry.c2=(float)(ldexp((double)(state[98]),-24)); + carry.c3=(float)(ldexp((double)(state[99]),-24)); + carry.c4=(float)(ldexp((double)(state[100]),-24)); + + pr=state[101]; + ir=state[102]; + jr=state[103]; + is=state[104]; + is_old=8*ir; + prm=pr%12; + init=1; + + error_loc(((pr!=109)&&(pr!=202)&&(pr!=397))|| + (ir<0)||(ir>11)||(jr<0)||(jr>11)||(jr!=((ir+7)%12))|| + (is<0)||(is>95),1, + "rlxs_reset [ranlxs.c]","Unexpected input data"); +} + +#else + +#define BASE 0x1000000 +#define MASK 0xffffff + +typedef struct +{ + int c1,c2,c3,c4; +} vec_t; + +typedef struct +{ + vec_t c1,c2; +} dble_vec_t; + +static int init=0,pr,prm,ir,jr,is,is_old,next[96]; +static float one_bit; +static vec_t carry; + +static union +{ + dble_vec_t vec[12]; + int num[96]; +} x; + +#define STEP(pi,pj) \ + d=(*pj).c1.c1-(*pi).c1.c1-carry.c1; \ + (*pi).c2.c1+=(d<0); \ + d+=BASE; \ + (*pi).c1.c1=d&MASK; \ + d=(*pj).c1.c2-(*pi).c1.c2-carry.c2; \ + (*pi).c2.c2+=(d<0); \ + d+=BASE; \ + (*pi).c1.c2=d&MASK; \ + d=(*pj).c1.c3-(*pi).c1.c3-carry.c3; \ + (*pi).c2.c3+=(d<0); \ + d+=BASE; \ + (*pi).c1.c3=d&MASK; \ + d=(*pj).c1.c4-(*pi).c1.c4-carry.c4; \ + (*pi).c2.c4+=(d<0); \ + d+=BASE; \ + (*pi).c1.c4=d&MASK; \ + d=(*pj).c2.c1-(*pi).c2.c1; \ + carry.c1=(d<0); \ + d+=BASE; \ + (*pi).c2.c1=d&MASK; \ + d=(*pj).c2.c2-(*pi).c2.c2; \ + carry.c2=(d<0); \ + d+=BASE; \ + (*pi).c2.c2=d&MASK; \ + d=(*pj).c2.c3-(*pi).c2.c3; \ + carry.c3=(d<0); \ + d+=BASE; \ + (*pi).c2.c3=d&MASK; \ + d=(*pj).c2.c4-(*pi).c2.c4; \ + carry.c4=(d<0); \ + d+=BASE; \ + (*pi).c2.c4=d&MASK + + +static void update(void) +{ + int k,kmax,d; + dble_vec_t *pmin,*pmax,*pi,*pj; + + kmax=pr; + pmin=&x.vec[0]; + pmax=pmin+12; + pi=&x.vec[ir]; + pj=&x.vec[jr]; + + for (k=0;k=12) + ir-=12; + if (jr>=12) + jr-=12; + is=8*ir; + is_old=is; +} + + +static void define_constants(void) +{ + int k; + + one_bit=(float)(ldexp(1.0,-24)); + + for (k=0;k<96;k++) + next[k]=(k+1)%96; +} + + +void rlxs_init(int level,int seed) +{ + int i,k,l; + int ibit,jbit,xbit[31]; + int ix,iy; + + error_loc((INT_MAX<2147483647)||(FLT_RADIX!=2)||(FLT_MANT_DIG<24),1, + "rlxs_init [ranlxs.c]", + "Arithmetic on this machine is not suitable for ranlxs"); + + define_constants(); + + error_loc((level<0)||(level>2),1,"rlxs_init [ranlxs.c]", + "Bad choice of luxury level (should be 0,1 or 2)"); + + if (level==0) + pr=109; + else if (level==1) + pr=202; + else if (level==2) + pr=397; + + i=seed; + + for (k=0;k<31;k++) + { + xbit[k]=i%2; + i/=2; + } + + error_loc((seed<=0)||(i!=0),1,"rlxs_init [ranlxs.c]", + "Bad choice of seed (should be between 1 and 2^31-1)"); + + ibit=0; + jbit=18; + + for (i=0;i<4;i++) + { + for (k=0;k<24;k++) + { + ix=0; + + for (l=0;l<24;l++) + { + iy=xbit[ibit]; + ix=2*ix+iy; + + xbit[ibit]=(xbit[ibit]+xbit[jbit])%2; + ibit=(ibit+1)%31; + jbit=(jbit+1)%31; + } + + if ((k%4)==i) + ix=16777215-ix; + + x.num[4*k+i]=ix; + } + } + + carry.c1=0; + carry.c2=0; + carry.c3=0; + carry.c4=0; + + ir=0; + jr=7; + is=95; + is_old=0; + prm=pr%12; + init=1; +} + + +void ranlxs(float r[],int n) +{ + int k; + + if (init==0) + rlxs_init(0,1); + + for (k=0;k=167777216),1, + "rlxs_reset [ranlxs.c]","Unexpected input data"); + + x.num[k]=state[k+1]; + } + + error_loc(((state[97]!=0)&&(state[97]!=1))|| + ((state[98]!=0)&&(state[98]!=1))|| + ((state[99]!=0)&&(state[99]!=1))|| + ((state[100]!=0)&&(state[100]!=1)),1, + "rlxs_reset [ranlxs.c]","Unexpected input data"); + + carry.c1=state[97]; + carry.c2=state[98]; + carry.c3=state[99]; + carry.c4=state[100]; + + pr=state[101]; + ir=state[102]; + jr=state[103]; + is=state[104]; + is_old=8*ir; + prm=pr%12; + init=1; + + error_loc(((pr!=109)&&(pr!=202)&&(pr!=397))|| + (ir<0)||(ir>11)||(jr<0)||(jr>11)||(jr!=((ir+7)%12))|| + (is<0)||(is>95),1, + "rlxs_reset [ranlxs.c]","Unexpected input data"); +} + +#endif + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/ratfcts/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/ratfcts/README new file mode 100644 index 0000000000000000000000000000000000000000..fcdfa4dddc12583226bbfd726ff5dd19726d2b8d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/ratfcts/README @@ -0,0 +1,46 @@ + +******************************************************************************** + + Rational functions + +******************************************************************************** + + +Files +----- + +elliptic.c Computation of the Jacobi elliptic functions sn, cn + and dn + +ratfcts.c Rational function coefficients data base + +zolotarev.c Computation of the Zolotarev rational approximation + to 1/sqrt(y) + +Include file +------------ + +The file ratfcts.h defines the prototypes for all externally accessible +functions that are defined in the *.c files listed above. + + +List of functions +----------------- + +double ellipticK(double rk) + Returns the complete elliptic integral K(k) for 0<=k<1. The value + of k is to be passed through the argument rk=k/k' (see the notes). + +void sncndn(double u,double rk,double *sn,double *cn,double *dn) + Computes the Jacobi elliptic functions sn(u,k), cn(u,k), dn(u,k) + for specified real u and 0<=k<1. The value of k is to be passed + through the argument rk=k/k' (see the notes). + +ratfct_t ratfct(int *irat) + Returns a structure containing the coefficients of the rational + function specified by the integers irat[3] (see the notes). + +void zolotarev(int n,double eps,double *A,double *ar,double *delta) + Computes the amplitude A, the coefficients ar[r-1]=a_r, r=1,..,2n, + and the error delta of the Zolotarev optimal rational approximation + of degree [n,n] to the function f(y)=1/sqrt(y). diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/ratfcts/elliptic.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/ratfcts/elliptic.c new file mode 100644 index 0000000000000000000000000000000000000000..2d35557658cffccd089032723276e1f43fb17f02 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/ratfcts/elliptic.c @@ -0,0 +1,264 @@ + +/******************************************************************************* +* +* File elliptic.c +* +* Copyright (C) 2008, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Computation of the Jacobi elliptic functions sn, cn and dn +* +* The externally accessible functions are +* +* double ellipticK(double rk) +* Returns the complete elliptic integral K(k) for 0<=k<1. The value +* of k is to be passed through the argument rk=k/k' (see the notes). +* +* void sncndn(double u,double rk,double *sn,double *cn,double *dn) +* Computes the Jacobi elliptic functions sn(u,k), cn(u,k), dn(u,k) +* for specified real u and 0<=k<1. The value of k is to be passed +* through the argument rk=k/k' (see the notes). +* +* Notes: +* +* The complete elliptic integral and the Jacobi elliptic functions in the +* range -K/2<=u<=K/2 are obtained practically to machine precision. In +* particular, sn(u,k)=u+O(u^3) and cn(u,k)=1-u^2/2+O(u^4) exactly. +* +* Other values of u are first mapped to the interval 0<=u<=K/2 using the +* symmetry properties of the elliptic functions and the numerically computed +* value of K. In general this implies a loss of significance of the argument +* which propagates to the computed functions. +* +* The complete elliptic integral is obtained via the arithmetic-geometric +* mean. For small u, the Jacobi elliptic functions are calculated using +* the Taylor expansion. Elsewhere the descending Landen transformation is +* used. See +* +* M. Abramowitz, I. A. Stegun: "Handbook of mathematical functions", +* (Dover Publications, New York, 1972) +* +* for example. +* +* These methods eventually require both k and k'=sqrt(1-k*k) as input. While +* k' can be computed for given k, there can be important significance losses +* at this point if k is close to 1. On the other hand, if rk=k/k' is given, +* k and k' can be computed with negligible significance losses through +* +* k=rk/sqrt(1+rk^2), k'=1/sqrt(1+rk^2). +* +* This is why rk is chosen as input parameter in the programs in this file. +* +*******************************************************************************/ + +#define ELLIPTIC_C + +#include +#include +#include +#include +#include "utils.h" +#include "ratfcts.h" + + +static double agm(double x,double y) +{ + double px,py; + + for (;;) + { + px=x; + py=y; + + x=0.5*(px+py); + y=sqrt(px*py); + + if ((x<=y)||(x>=px)||(y<=py)) + return x; + } +} + + +double ellipticK(double rk) +{ + double x,y; + + if (rk<0.0) + { + error_loc(1,1,"ellipticK [elliptic.c]","Argument rk is out of range"); + + return 1.0; + } + + x=1.0+rk/sqrt(1.0+rk*rk); + y=1.0/(x*(1.0+rk*rk)); + + return (2.0*atan(1.0))/agm(x,y); +} + + +static double sn_small(double u,double rk) +{ + double m,u2,sn; + double s0,s2,s4,s6; + + m=(rk*rk)/(1.0+rk*rk); + + s0=1.0; + s2=-(1.0+m)/6.0; + s4=(1.0+14.0*m+m*m)/120.0; + s6=-(1.0+135.0*m*(1.0+m)+m*m*m)/5040.0; + + u2=u*u; + sn=s4+s6*u2; + sn=s2+sn*u2; + sn=s0+sn*u2; + + return sn*u; +} + + +static void sncn_limit(double u,double rk,double *sn,double *cn) +{ + double k,m,s,c,r; + + k=rk/sqrt(1.0+rk*rk); + m=k*k; + + s=sin(u); + c=cos(u); + r=0.25*m*(u-s*c); + + (*sn)=s-r*c; + (*cn)=c+r*s; +} + + +static void landen(double u,double rk,double *sn,double *cn) +{ + int n; + double k,kp,kt,ktp; + double delta,fact; + + delta=sqrt(DBL_EPSILON); + kp=1.0/sqrt(1.0+rk*rk); + k=rk*kp; + + for (n=0;k>delta;n++) + { + kt=(k*k)/((1.0+kp)*(1.0+kp)); + ktp=(2.0*sqrt(kp))/(1.0+kp); + u*=(0.5+0.5*kp); + + k=kt; + kp=ktp; + } + + sncn_limit(u,k/kp,sn,cn); + + kt=k; + ktp=kp; + + for (;n>0;n--) + { + k=(2.0*sqrt(kt))/(1.0+kt); + kp=(ktp*ktp)/((1.0+kt)*(1.0+kt)); + + fact=1.0/(1.0+kt*(*sn)*(*sn)); + (*sn)=(1.0+kt)*(*sn)*fact; + (*cn)=(*cn)*sqrt(ktp*ktp+kt*kt*(*cn)*(*cn))*fact; + + kt=k; + ktp=kp; + } +} + + +void sncndn(double u,double rk,double *sn,double *cn,double *dn) +{ + int n,flip; + double k,kp,K,delta,cd,sd,nd; + double sgn_sn,sgn_cn; + + if (rk<0.0) + { + error_loc(1,1,"sncndn [elliptic.c]","Argument rk is out of range"); + + (*sn)=0.0; + (*cn)=1.0; + (*dn)=0.0; + + return; + } + + sgn_sn=1.0; + sgn_cn=1.0; + + if (u<0.0) + { + u=-u; + sgn_sn*=-1.0; + } + + K=ellipticK(rk); + n=(int)(u/K); + u-=(double)(n)*K; + n=n%4; + + if (n==1) + { + u=K-u; + sgn_cn*=-1.0; + } + else if (n==2) + { + sgn_sn*=-1.0; + sgn_cn*=-1.0; + } + else if (n==3) + { + u=K-u; + sgn_sn*=-1.0; + } + + if ((2.0*u)<=K) + flip=0; + else + { + u=K-u; + flip=1; + } + + kp=1.0/sqrt(1.0+rk*rk); + k=rk*kp; + + delta=pow(DBL_EPSILON,0.125); + if (delta>1.0e-3) + delta=1.0e-3; + + if (fabs(u)0. The functions provided by this module +* instead approximate the function 1/|x| in a range ra<=|x|<=rb specified +* in the parameter data base. The relation between x and y is +* +* y=x^2/rb^2 +* +* and thus eps=(ra/rb)^2. +* +* The coefficients a[r], r=0,..,2*n-1, returned by the program zolotarev() +* are ordered such that +* +* a[0]>a[1]>..>a[2*n-1]>0. +* +* For any given integers k,l satisfying k>=0 and k<=l +#include +#include +#include "mpi.h" +#include "flags.h" +#include "utils.h" +#include "ratfcts.h" + +#define IRMAX 32 + +static int init=0,ns,irs,irats[IRMAX][3]; +static double *ars; +static ratfct_t rats[IRMAX]={{0,0.0,1.0,NULL,NULL,NULL,NULL}}; + + +static void init_rat(void) +{ + int ir; + + for (ir=0;ir0) + rats[ir]=rats[0]; + + irats[ir][0]=0; + irats[ir][1]=0; + irats[ir][2]=0; + } + + ns=0; + irs=0; + ars=NULL; + init=1; +} + + +static int fnd_rat(int *irat) +{ + int ir; + + for (ir=0;ir=n),1,"alloc_rat [ratfcts.c]", + "Improper coefficient range or undefined rational function"); + + if (n>ns) + { + if (ns>0) + free(ars); + ars=malloc(2*n*sizeof(*ars)); + ns=n; + } + + mu=malloc(4*np*sizeof(*mu)); + + error((ars==NULL)||(mu==NULL),1,"alloc_rat [ratfcts.c]", + "Unable to allocate coefficient arrays"); + + rats[irs].np=np; + rats[irs].mu=mu; + rats[irs].rmu=mu+np; + rats[irs].nu=mu+2*np; + rats[irs].rnu=mu+3*np; + + irats[irs][0]=irat[0]; + irats[irs][1]=irat[1]; + irats[irs][2]=irat[2]; +} + + +static void set_rat(int *irat) +{ + int n,np,k,l,i,j; + double ra,rb,pmu,pnu; + double eps,A,delta,*ar; + double *mu,*nu,*rmu,*rnu; + rat_parms_t rp; + + rp=rat_parms(irat[0]); + n=rp.degree; + k=irat[1]; + l=irat[2]; + np=l-k+1; + + ra=rp.range[0]; + rb=rp.range[1]; + eps=ra/rb; + eps=eps*eps; + + zolotarev(n,eps,&A,ars,&delta); + rats[irs].A=A/rb; + rats[irs].delta=delta; + + ar=ars+2*k; + mu=rats[irs].mu; + nu=rats[irs].nu; + rmu=rats[irs].rmu; + rnu=rats[irs].rnu; + + for (i=0;i +#include +#include +#include "utils.h" +#include "ratfcts.h" + + +void zolotarev(int n,double eps,double *A,double *ar,double *delta) +{ + int r; + double v,k,rk,d,s; + double sn,cn,dn,snx,cnx,dnx; + + if ((n<1)||(eps<=0.0)||(eps>=1.0)) + { + error_loc(1,1,"zolotarev [zolotarev.c]","Arguments are out of range"); + + (*A)=1.0; + (*delta)=1.0; + + return; + } + + k=sqrt(1.0-eps); + rk=k/sqrt(eps); + v=ellipticK(rk)/(double)(2*n+1); + + (*A)=1.0; + d=k; + + for (r=1;r<=(2*n);r++) + { + if (r<=n) + { + sncndn((double)(r)*v,rk,&sn,&cn,&dn); + ar[r-1]=(cn*cn)/(sn*sn); + } + else + { + sncndn((double)(2*n+1-r)*v,rk,&snx,&cnx,&dnx); + ar[r-1]=eps*((snx*snx)/(cnx*cnx)); + sn=cnx/dnx; + } + + s=sn*sn; + + if ((r%2)==0) + (*A)/=s; + else + { + (*A)*=s; + s*=k; + d*=(s*s); + } + } + + s=1.0+sqrt(1.0-d*d); + (*A)*=(2.0/s); + (*delta)=(d*d)/(s*s); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sap/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sap/README new file mode 100644 index 0000000000000000000000000000000000000000..13148190d9a601de780c1e9abb81132aa7a3e29e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sap/README @@ -0,0 +1,73 @@ + +******************************************************************************** + + Schwarz Alternating Procedure (SAP) + +******************************************************************************** + + +Files +----- + +blk_solv.c Solution of the Dirac equation on the blocks of the + SAP_BLOCKS grid. + +sap.c Multiplicative alternating Schwarz procedure for the + solution of the Wilson-Dirac equation. + +sap_com.c SAP communication program. + +sap_gcr.c SAP+GCR solver for the Wilson-Dirac equation. + + +Include file +------------ + +The file sap.h defines the prototypes for all externally accessible +functions that are defined in the *.c files listed above. + + +List of functions +----------------- + +void blk_mres(int n,float mu,int nmr) + Depending on whether the twisted-mass flag is set or not, this + program approximately solves (Dw+i*mu*gamma_5*1e)*b.s[0]=b.s[1] or + (Dw+i*mu*gamma_5)*b.s[0]=b.s[1] on the n'th block b of the SAP_BLOCKS + grid. The solution is obtained by applying nmr minimal residual steps, + using b.s[2] as workspace. On exit, the approximate solution and its + residue are in b.s[0] and b.s[1], respectively. + +void blk_eo_mres(int n,float mu,int nmr) + Approximate solution of (Dwhat+i*mu*gamma_5)*b.s[0]=b.s[1] for given + b.s[1] on the n'th block b of the SAP_BLOCKS grid. The solution is + obtained by applying nmr minimal residual steps, using b.s[2] as + workspace. On exit, the approximate solution and its residue are in + b.s[0] and b.s[1], respectively, while b.s[0],b.s[1] and b.s[2] are + unchanged on the odd points. + +void sap(float mu,int isolv,int nmr,spinor *psi,spinor *rho) + Application of one cycle of the multiplicative Schwarz procedure to + the approximate solution psi of the Wilson-Dirac equation, assuming + the associated residue is stored in the field rho (see the notes). The + block Dirac equation is solved using nmr iterations of the ordinary + (isolv=0) or the even-odd preconditioned (isolv=1) minimal residual + algorithm. On exit, the new approximate solution and its residue are + returned in the fields psi and rho. + +void alloc_sap_bufs(void) + Allocates and initializes the buffers and index arrays needed for + the program sap_com(). + + void sap_com(int ic,spinor *r) + Subtracts the Weyl field b.bb.w[0] on the boundaries of all black + (if ic=0) or all white (if ic=1) blocks b of the SAP_BLOCKS grid + from the global spinor field r. Before subtraction, the Weyl fields + on the block faces in direction ifc are expanded to Dirac spinor + fields s satisfying theta[ifc]*s=0. + +double sap_gcr(int nkv,int nmx,double res,double mu, + spinor_dble *eta,spinor_dble *psi,int *status) + Obtains an approximate solution psi of the Wilson-Dirac equation for + given source eta using the SAP-preconditioned GCR algorithm. See the + notes for the explanation of the parameters of the program. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sap/README.sap_com b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sap/README.sap_com new file mode 100644 index 0000000000000000000000000000000000000000..a5d967cb7a1a1414de18fd0cd76a5192169162ef --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sap/README.sap_com @@ -0,0 +1,153 @@ + +******************************************************************************** + + SAP communication program + +******************************************************************************** + +The application of the Schwarz Alternating Procedure (SAP) as a preconditioner +for the Wilson-Dirac operator in lattice QCD is described in + + M. Luescher: "Solution of the Dirac equation in lattice QCD using a domain + decomposition method", Comp. Phys. Commun. 156 (2004) 209. + +The SAP approximately solves the Dirac equation by running through the blocks +in the SAP_BLOCKS block grid. On each block, the Dirac equation is solved, to +some accuracy, on the interior points of the block using an iterative method +such as the minimal residual algorithm. The current solution on the full +lattice is then updated and the algorithm proceeds to the next block. + +In practice the solution is updated on all black blocks simultaneously and +subsequently on all white blocks. Communications are then required after all +blocks of given colour are processed. It is important that the communication +is done efficiently. The programs in the module sap_com.c achieve this goal +using an adapted layout of the field arrays and non-blocking communications. + + +Block boundary fields +--------------------- + +Once the Dirac equation is solved on all black (ic=0) [or all white (ic=1)] +blocks, the approximate solution and its residue on the full lattice must be +updated. In particular, the residue receives a correction at the exterior +boundaries of the blocks. The correction amounts to subtracting a Weyl field +on the block boundaries from the residue (see below). However, before the +correction can be applied, the Weyl field on the block faces that are not +contained in the local lattice must be copied to the neighbouring MPI +processes. + +The block faces in the -0,+0,..,-3,+3 direction are labeled by an index +ifc=0,..,7. It is advantageous to organize the field communications in such a +way that the fields on the block faces with fixed index ifc are processed +together. In memory the Weyl fields are therefore arranged in two arrays + + weyl snd_buf[2][8][], + + weyl loc_buf[2][8][], + +where the first index is the colour ic of the blocks, the second the face +index ifc and the third a point index. For a given colour ic and a given face +index ifc, the Weyl fields collected in the array snd_buf[ic][ifc] are those +on the faces b.bb[ifc] of the blocks b with colour ic where b.bb[ifc].ibn=1. +Similarly the fields in the array loc_buf[ic][ifc] are those on the faces +where b.bb[ifc].ibn=0. Within each of the arrays, the fields are ordered in +block order and the Weyl spinors in each block segment are ordered according +to the block geometry arrays (see block/README.block). + + +Communication +------------- + +After solving the Dirac equation on all blocks of a given colour ic, the +buffers snd_buf[ic][ifc], ifc=0,..,7, need to be sent to the nodes with rank +npr[ifc]. The data sent are received from the nodes with rank npr[ifc^0x1] +and are stored in the [ic][ifc] components of the array + + weyl rcv_buf[2][8][]. + +Note that snd_buf[ic][ifc] has the same size on all MPI processes (and, +consequently, the same size as rcv_buf[ic][ifc]). Translation invariance +implies this to be so if the number of blocks on the local lattice in +direction ifc is even. In the other case, there must be an even number of +blocks touching the face of the local lattice with index ifc, because the +total number of blocks in the local lattice is even. Half of these blocks have +colour ic and the size of snd_buf[ic][ifc] is then again independent of the +rank of the process. + +As explained in main/README.global, the MPI processes form a hypercubic grid. +In this grid, each process has Cartesian coordinates cpr[mu] (mu=0,1,2,3). One +can then define the parity bits + + np=(cpr[0]+cpr[1]+cpr[2]+cpr[3])&0x1, + + nmu[ifc]=cpr[ifc/2]&0x1, + +and first perform the communication from the np=0 to the np=1 nodes according +to + + np=0 nodes np=1 nodes + + io=ifc^nmu[ifc] io=(ifc^nmu[ifc])^0x1 + snd_buf[ic][io] -> npr[io] rcv_buf[ic][io] <- npr[io^0x1] + +The communication from the np=1 to the np=0 nodes is then performed according +to + + np=0 nodes np=1 nodes + + io=(ifc^nmu[ifc])^0x1 io=ifc^nmu[ifc] + rcv_buf[ic][io] <- npr[io^0x1] snd_buf[ic][io] -> npr[io] + +The send and receive buffers are properly paired in both cases. Moreover, +in each case, the size of the buffers communicated is the same on all nodes. +All nodes thus have exactly the same communication load. + +This pattern is such that the communication proceeds, in each case, across the +hyperplanes orthogonal to the direction ifc, the planes being separated by +2x(local lattice size in that direction). If boundary conditions of type 0,1 +or 2 are chosen, no communications across the boundaries of the lattice at +time 0 and NPROC0*L0-1 are performed. The chosen scheme allows these to be +easily omitted. + + +Subtraction from the residue +---------------------------- + +After communicating the Weyl fields, the fields to be subtracted from the +residue are contained in the arrays loc_buf[ic][ifc] and rcv_buf[ic][ifc]. For +any given colour index ic and face index ifc, these two arrays come one after +the other in memory so that one has in fact a single array of Weyl spinors +with address loc_buf[ic][ifc]. + +The Weyl spinors w on the block boundaries are the first two components +of the Dirac spinors s obtained by applying the block boundary part of +the Dirac operator to a field on the block (see dirac/Dw_bnd.c). Since + + theta[ifc]*s=0 + +where + + theta[ifc] = (1/2)*(1+gamma_mu) if ifc=2*mu, + + = (1/2)*(1-gamma_mu) if ifc=2*mu+1, + +the knowledge of w allows s to be reconstructed uniquely. The reconstruction +of the Dirac spinors is done on the fly by the program + + sub_assign_w2s[ifc^0x1](imb[ic][ifc],nlbf[ic][ifc]+nsbf[ic][ifc], + loc_buf[ic][ifc],res) + +which then subtracts the spinors from the residue field res on the full lattice +(see sflds/Pbnd.c). The other parameters in this function call are: + + nlbf[ic][ifc] Number of elements of the buffer loc_buf[ic][ifc], + + nsbf[ic][ifc] Number of elements of the buffer snd_buf[ic][ifc] + (= number of elements of rcv_buf[ic][ifc]), + + imb[ic][ifc][] Array of the indices of the points in the local lattice + where the reconstructed spinors are to be subtracted + from the residue. + +The index array imb[ic][ifc] is calculated and stored in the static memory of +the module sap_com.c when the communication buffers are allocated. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sap/blk_solv.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sap/blk_solv.c new file mode 100644 index 0000000000000000000000000000000000000000..3c59fbdfc1f949993de5950c81357c4b4823a2c8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sap/blk_solv.c @@ -0,0 +1,1026 @@ + +/******************************************************************************* +* +* File blk_solv.c +* +* Copyright (C) 2005, 2011, 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Solution of the Dirac equation on the blocks of the SAP_BLOCKS grid +* +* The externally accessible functions are +* +* void blk_mres(int n,float mu,int nmr) +* Depending on whether the twisted-mass flag is set or not, this +* program approximately solves (Dw+i*mu*gamma_5*1e)*b.s[0]=b.s[1] or +* (Dw+i*mu*gamma_5)*b.s[0]=b.s[1] on the n'th block b of the SAP_BLOCKS +* grid. The solution is obtained by applying nmr minimal residual steps, +* using b.s[2] as workspace. On exit, the approximate solution and its +* residue are in b.s[0] and b.s[1], respectively. +* +* void blk_eo_mres(int n,float mu,int nmr) +* Approximate solution of (Dwhat+i*mu*gamma_5)*b.s[0]=b.s[1] for given +* b.s[1] on the n'th block b of the SAP_BLOCKS grid. The solution is +* obtained by applying nmr minimal residual steps, using b.s[2] as +* workspace. On exit, the approximate solution and its residue are in +* b.s[0] and b.s[1], respectively, while b.s[0],b.s[1] and b.s[2] are +* unchanged on the odd points. +* +* Notes: +* +* The twisted-mass flag is retrieved from the parameter data base (see +* flags/lat_parms.c). These programs do not perform any communications and +* can be called locally. It is taken for granted that the SAP_BLOCKS grid +* is allocated and that the gauge field and the SW term on the blocks are +* in the proper condition. +* +*******************************************************************************/ + +#define BLK_SOLV_C + +#include +#include +#include +#include +#include "su3.h" +#include "utils.h" +#include "sflds.h" +#include "linalg.h" +#include "block.h" +#include "dirac.h" +#include "sap.h" + +static int vol; +static spinor **s; + +#if (defined x64) +#include "sse2.h" + +#if (defined AVX) +#include "avx.h" + +static float unity=1.0f; + + +static void scalar_prods(float *r,complex *z) +{ + spinor *s1,*s2,*sm; + + __asm__ __volatile__ ("vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" + "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" + "vxorpd %%ymm14, %%ymm14, %%ymm14" + : + : + : + "xmm12", "xmm13", "xmm14"); + + s1=s[1]; + s2=s[2]; + sm=s1+vol; + + for (;s1 +#include +#include +#include +#include "su3.h" +#include "flags.h" +#include "block.h" +#include "dirac.h" +#include "sap.h" +#include "global.h" + +static int vol; +static spinor **s; + +#if (defined AVX) +#include "avx.h" + +static void update_flds0(int *imb,spinor *psi,spinor *rho) +{ + spinor *sb,*rb,*sm; + spinor *sl,*rl,*sln; + + sb=s[0]; + rb=s[1]; + sm=sb+vol; + sln=psi+imb[0]; + + for (;sb +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "sflds.h" +#include "block.h" +#include "sap.h" +#include "global.h" + +static int nb,nbh,isw,init=0; +static int bc,np,nmu[8],sflg[8]; +static int nsbf[2][8],nlbf[2][8],*imb[2][8]; +static weyl *snd_buf[2][8],*loc_buf[2][8],*rcv_buf[2][8]; +static const weyl w0={{{0.0f}}}; +static block_t *b0; +static MPI_Request snd_req[2][8],rcv_req[2][8]; + + +static void set_nbf(void) +{ + int ifc,ibu,ibd; + int *bo,*bs; + block_t *b,*bm; + bndry_t *bb; + + bc=bc_type(); + np=(cpr[0]+cpr[1]+cpr[2]+cpr[3])&0x1; + + bs=(*b0).bs; + ibu=((cpr[0]==(NPROC0-1))&&(bc!=3)); + ibd=((cpr[0]==0)&&(bc!=3)); + + for (ifc=0;ifc<8;ifc++) + { + nmu[ifc]=cpr[ifc/2]&0x1; + sflg[ifc]=((ifc>1)|| + ((ifc==0)&&(cpr[0]!=0))|| + ((ifc==1)&&(cpr[0]!=(NPROC0-1)))|| + (bc==3)); + + nlbf[0][ifc]=0; + nsbf[0][ifc]=0; + nlbf[1][ifc]=0; + nsbf[1][ifc]=0; + + b=b0; + bm=b+nbh; + + for (;b=-1. Otherwise the field psi is set to zero and the +* program returns the norm of the source eta. +* +* The SAP_BLOCKS blocks grid is automatically allocated and the SW term is +* recalculated when needed. The gauge and SW fields are then copied to the +* block grid if they are not in the proper condition. +* +* Evidently the SAP+GCR solver is a global program that must be called on +* all processes simultaneously. The required workspaces are +* +* spinor 2*nkv+1 +* spinor_dble 2 +* +* (see utils/wspace.c). +* +*******************************************************************************/ + +#define SAP_GCR_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "sflds.h" +#include "linalg.h" +#include "block.h" +#include "sw_term.h" +#include "dirac.h" +#include "linsolv.h" +#include "sap.h" +#include "global.h" + +static double mud; +static sap_parms_t spr; + + +static void Dop(spinor_dble *s,spinor_dble *r) +{ + Dw_dble(mud,s,r); +} + + +static void Mop(int k,spinor *rho,spinor *phi,spinor *chi) +{ + int n; + + set_s2zero(VOLUME,phi); + assign_s2s(VOLUME,rho,chi); + + for (n=0;n +#include +#include +#include +#include "su3.h" +#include "sflds.h" + +#if (defined x64) +#include "sse2.h" + +#define _load_cst(c) \ +__asm__ __volatile__ ("movss %0, %%xmm15 \n\t" \ + "shufps $0x0, %%xmm15, %%xmm15" \ + : \ + : \ + "m" (c) \ + : \ + "xmm15") + +#define _mul_cst() \ +__asm__ __volatile__ ("mulps %%xmm15, %%xmm0 \n\t" \ + "mulps %%xmm15, %%xmm1 \n\t" \ + "mulps %%xmm15, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +static const float poh=0.5f; + + +static void assign_s2w0(int *imb,int vol,spinor *s,weyl *r) +{ + weyl *rm; + spinor *si,*sin; + + _load_cst(poh); + rm=r+vol; + si=s+(*imb); + imb+=(r<(rm-1)); + sin=s+(*imb); + + for (;r +#include +#include +#include +#include "su3.h" +#include "sflds.h" + +#if (defined x64) +#include "sse2.h" + +static const sse_double poh={0.5,0.5}; + + +static void assign_sd2wd0(int *imb,int vol,spinor_dble *sd,weyl_dble *rd) +{ + weyl_dble *rm; + spinor_dble *si,*sin; + + rm=rd+vol; + si=sd+(*imb); + imb+=(rd<(rm-1)); + sin=sd+(*imb); + + for (;rd +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "global.h" + +static int bc,np,nmu[8],nbf[8],ofs[8]; +static int ns,sfc[8],rfc[8],sflg[8]; +static int itags=0,tags[8]; +static weyl *wb=NULL,*snd_buf[8],*rcv_buf[8]; +static const weyl w0={{{0.0f}}}; +static MPI_Request snd_req[8],rcv_req[8]; + + +static void get_tags(void) +{ + int i; + + if (itags==0) + { + for (i=0;i<8;i++) + tags[i]=mpi_permanent_tag(); + + itags=1; + } +} + + +static void alloc_sbufs(void) +{ + int n,ifc,tag,saddr,raddr; + weyl *w,*wm; + + error(iup[0][0]==0,1,"alloc_sbufs [scom.c]", + "Geometry arrays are not set"); + + wb=amalloc(BNDRY*sizeof(*wb),ALIGN); + error(wb==NULL,1,"alloc_sbufs [scom.c]", + "Unable to allocate communication buffers"); + + w=wb; + wm=wb+BNDRY; + + for (;w0) + ofs[ifc]=ofs[ifc-1]+nbf[ifc-1]; + + if (nbf[ifc]>0) + { + sfc[ns]=ifc; + ns+=1; + + snd_buf[ifc]=w; + w+=nbf[ifc]; + rcv_buf[ifc]=w; + w+=nbf[ifc]; + + tag=tags[ifc]; + saddr=npr[ifc]; + raddr=npr[ifc^0x1]; + + MPI_Send_init(snd_buf[ifc],12*nbf[ifc],MPI_FLOAT,saddr, + tag,MPI_COMM_WORLD,&snd_req[ifc]); + MPI_Recv_init(rcv_buf[ifc],12*nbf[ifc],MPI_FLOAT,raddr, + tag,MPI_COMM_WORLD,&rcv_req[ifc]); + } + + sflg[ifc]=((ifc>1)|| + ((ifc==0)&&(cpr[0]!=0))|| + ((ifc==1)&&(cpr[0]!=(NPROC0-1)))|| + (bc==3)); + } + + for (n=0;n0) + send_bufs(sfc[m],eo); + + ifc=sfc[n]; + io=ifc^nmu[ifc]; + + if (sflg[io]) + assign_s2w[io^is](map+ofs[io^0x1],nbf[io],s,snd_buf[io]); + else + bnd_s2zero(EVEN_PTS,s); + + if (n>0) + { + wait_bufs(sfc[m],eo); + m+=eo; + eo^=0x1; + } + } + + for (n=0;n<2;n++) + { + send_bufs(sfc[m],eo); + wait_bufs(sfc[m],eo); + m+=eo; + eo^=0x1; + } + + for (n=0;n0) + send_bufs(rfc[m],eo); + + ifc=rfc[n]; + io=ifc^nmu[ifc]; + + if (sflg[io]) + zip_weyl(nbf[io],sb+ofs[io],snd_buf[io]); + + if (n>0) + { + wait_bufs(rfc[m],eo); + m+=eo; + eo^=0x1; + } + } + + for (n=0;n<2;n++) + { + send_bufs(rfc[m],eo); + wait_bufs(rfc[m],eo); + m+=eo; + eo^=0x1; + } + + for (n=0;n +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "global.h" + +static int bc,np,nmu[8],nbf[8],ofs[8]; +static int ns,sfc[8],rfc[8],sflg[8]; +static int itags=0,tags[8]; +static weyl_dble *wb=NULL,*snd_buf[8],*rcv_buf[8]; +static const weyl_dble w0={{{0.0}}}; +static MPI_Request snd_req[8],rcv_req[8]; + + +static void get_tags(void) +{ + int i; + + if (itags==0) + { + for (i=0;i<8;i++) + tags[i]=mpi_permanent_tag(); + + itags=1; + } +} + + +static void alloc_sdbufs(void) +{ + int n,ifc,tag,saddr,raddr; + weyl_dble *w,*wm; + + error(iup[0][0]==0,1,"alloc_sdbufs [sdcom.c]", + "Geometry arrays are not initialized"); + + wb=amalloc(BNDRY*sizeof(*wb),ALIGN); + error(wb==NULL,1,"alloc_sdbufs [sdcom.c]", + "Unable to allocate communication buffers"); + + w=wb; + wm=wb+BNDRY; + + for (;w0) + ofs[ifc]=ofs[ifc-1]+nbf[ifc-1]; + + if (nbf[ifc]>0) + { + sfc[ns]=ifc; + ns+=1; + + snd_buf[ifc]=w; + w+=nbf[ifc]; + rcv_buf[ifc]=w; + w+=nbf[ifc]; + + tag=tags[ifc]; + saddr=npr[ifc]; + raddr=npr[ifc^0x1]; + + MPI_Send_init((double*)(snd_buf[ifc]),12*nbf[ifc],MPI_DOUBLE,saddr, + tag,MPI_COMM_WORLD,&snd_req[ifc]); + MPI_Recv_init((double*)(rcv_buf[ifc]),12*nbf[ifc],MPI_DOUBLE,raddr, + tag,MPI_COMM_WORLD,&rcv_req[ifc]); + } + + sflg[ifc]=((ifc>1)|| + ((ifc==0)&&(cpr[0]!=0))|| + ((ifc==1)&&(cpr[0]!=(NPROC0-1)))|| + (bc==3)); + } + + for (n=0;n0) + send_bufs(sfc[m],eo); + + ifc=sfc[n]; + io=ifc^nmu[ifc]; + + if (sflg[io]) + assign_sd2wd[io^is](map+ofs[io^0x1],nbf[io],sd,snd_buf[io]); + else + bnd_sd2zero(EVEN_PTS,sd); + + if (n>0) + { + wait_bufs(sfc[m],eo); + m+=eo; + eo^=0x1; + } + } + + for (n=0;n<2;n++) + { + send_bufs(sfc[m],eo); + wait_bufs(sfc[m],eo); + m+=eo; + eo^=0x1; + } + + for (n=0;n0) + send_bufs(rfc[m],eo); + + ifc=rfc[n]; + io=ifc^nmu[ifc]; + + if (sflg[io]) + zip_weyl(nbf[io],sdb+ofs[io],snd_buf[io]); + + if (n>0) + { + wait_bufs(rfc[m],eo); + m+=eo; + eo^=0x1; + } + } + + for (n=0;n<2;n++) + { + send_bufs(rfc[m],eo); + wait_bufs(rfc[m],eo); + m+=eo; + eo^=0x1; + } + + for (n=0;n +#include +#include +#include "su3.h" +#include "random.h" +#include "sflds.h" + +#if (defined x64) +#include "sse2.h" + +void set_s2zero(int vol,spinor *s) +{ + spinor *sm; + + __asm__ __volatile__ ("xorps %%xmm0, %%xmm0 \n\t" + "xorps %%xmm1, %%xmm1 \n\t" + "xorps %%xmm2, %%xmm2" + : + : + : + "xmm0", "xmm1", "xmm2"); + + sm=s+vol; + + for (;s +#include +#include +#include +#include "utils.h" +#include "su3.h" +#include "su3fcts.h" + +#ifndef ALIGN +#define ALIGN 6 +#endif + +static void mapX2v(su3_alg_dble *X); +static void eval_td(su3_alg_dble *X); +static void ch_init(void); + +static int N,init_flag=0; +static double *c,t,d; +static su3_vector_dble v1,v2,v3,w ALIGNED16; +static su3_dble umat1,umat2 ALIGNED16; +static su3_alg_dble Y ALIGNED16; +static ch_drv0_t ALIGNED16 s; +static const ch_drv0_t sp0 ALIGNED16 ={0.0}; +static const ch_drv1_t sp1 ALIGNED16 ={0.0}; +static const ch_drv2_t sp2 ALIGNED16 ={0.0}; + + +static void eval_td(su3_alg_dble *X) +{ + t=3.0*((*X).c1*(*X).c1+(*X).c2*(*X).c2-(*X).c1*(*X).c2)+ + (*X).c3*(*X).c3+(*X).c4*(*X).c4+(*X).c5*(*X).c5+ + (*X).c6*(*X).c6+(*X).c7*(*X).c7+(*X).c8*(*X).c8; + + mapX2v(X); + _vector_cross_prod(w,v2,v3); + d=_vector_prod_im(v1,w); + + error_loc(fabs(d)>(1.000001*(1.000002-t)),1,"eval_td [chexp.c]", + "The norm of X is larger than 1"); +} + + +static void ch_init(void) +{ + int k; + double fctr; + + N=7; + fctr=1.0; + + while (fctr>DBL_EPSILON) + { + N++; + fctr/=(double)(N-7); + } + + N+=(N%2); + c=amalloc((N+1)*sizeof(*c),ALIGN); + + if (error_loc(c==NULL,1,"ch_init [chexp.c]", + "Unable to allocate auxiliary array")==0) + { + c[0]=1.0; + for (k=0;k3.0) + { + nfrb*=0.25; + eps*=0.5; + n++; + } + + Y.c1=eps*(*X).c1; + Y.c2=eps*(*X).c2; + Y.c3=eps*(*X).c3; + Y.c4=eps*(*X).c4; + Y.c5=eps*(*X).c5; + Y.c6=eps*(*X).c6; + Y.c7=eps*(*X).c7; + Y.c8=eps*(*X).c8; + + u1=&umat1; + u2=&umat2; + + chexp_drv0(&Y,&s); + ch2mat(s.p,&Y,u2); + + for (k=0;k0;n-=2) + { + __asm__ __volatile__("movapd %%xmm2, %%xmm4 \n\t" + "mulpd %%xmm6, %%xmm2 \n\t" + "shufpd $0x1, %%xmm4, %%xmm4 \n\t" + "addpd %%xmm0, %%xmm2\n\t" + "mulpd %%xmm7, %%xmm4 \n\t" + "movapd %%xmm1, %%xmm0 \n\t" + "addsd %0, %%xmm4